diff --git a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml new file mode 100644 index 0000000000000..eb4a50fd4fccd --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1 +model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.30 + - name: "exact_match,flexible-extract" + value: 0.465 +limit: 1319 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 6057229ac50f3..254d01edf8449 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml -Minitron-4B-Base-FP8.yaml +Qwen1.5-MoE-W4A16-compressed-tensors.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml Qwen2-1.5B-Instruct-FP8W8.yaml Meta-Llama-3-8B-QQQ.yaml diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 4ae23eff62f37..6015a83e82950 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -16,7 +16,7 @@ import numpy import pytest import yaml -RTOL = 0.05 +RTOL = 0.08 TEST_DATA_FILE = os.environ.get( "LM_EVAL_TEST_DATA_FILE", ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") diff --git 
a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index 4cd449b141ece..80ebb370ad461 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -10,15 +10,24 @@ set -x set -o pipefail check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if command -v nvidia-smi; then + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + elif command -v amd-smi; then + declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) + fi + if [[ $gpu_count -gt 0 ]]; then echo "GPU found." else echo "Need at least 1 GPU to run benchmarking." exit 1 fi - declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') + if command -v nvidia-smi; then + declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') + elif command -v amd-smi; then + declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}') + fi echo "GPU type is $gpu_type" } @@ -90,9 +99,15 @@ kill_gpu_processes() { # wait until GPU memory usage smaller than 1GB - while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do - sleep 1 - done + if command -v nvidia-smi; then + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do + sleep 1 + done + elif command -v amd-smi; then + while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do + sleep 1 + done + fi # remove vllm config file rm -rf ~/.config/vllm diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json index 415171e268b08..13fd5aa8db97b 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests.json 
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -63,10 +63,12 @@ "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "disable_log_requests": "", "tensor_parallel_size": 4, - "swap_space": 16, - "speculative_model": "turboderp/Qwama-0.5B-Instruct", - "num_speculative_tokens": 4, - "speculative_draft_tensor_parallel_size": 1 + "swap_space": 16, + "speculative_config": { + "model": "turboderp/Qwama-0.5B-Instruct", + "num_speculative_tokens": 4, + "draft_tensor_parallel_size": 1 + } }, "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 18f582b6e4c94..a21a657c4b05e 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -3,10 +3,10 @@ steps: agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -14,10 +14,10 @@ steps: agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." 
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -31,10 +31,10 @@ steps: agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -48,7 +48,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." 
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Build and publish TPU release image" @@ -57,7 +57,7 @@ steps: agents: queue: tpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." - "docker push vllm/vllm-tpu:nightly" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" plugins: @@ -82,7 +82,22 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." 
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" + + - block: "Build Neuron release image" + key: block-neuron-release-image-build + depends_on: ~ + + - label: "Build and publish Neuron release image" + depends_on: block-neuron-release-image-build + agents: + queue: neuron-postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" + env: + DOCKER_BUILDKIT: "1" diff --git a/.buildkite/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh similarity index 79% rename from .buildkite/run-amd-test.sh rename to .buildkite/scripts/hardware_ci/run-amd-test.sh index 0680bae13ddbf..368f30434aa1d 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -98,6 +98,13 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_machete_mm.py \ --ignore=kernels/test_mha_attn.py \ --ignore=kernels/test_block_fp8.py \ + --ignore=kernels/test_cutlass_moe.py \ + --ignore=kernels/test_mamba_ssm_ssd.py \ + --ignore=kernels/test_attention.py \ + --ignore=kernels/test_block_int8.py \ + --ignore=kernels/test_fused_quant_layernorm.py \ + --ignore=kernels/test_int8_kernel.py \ + --ignore=kernels/test_triton_moe_ptpc_fp8.py \ --ignore=kernels/test_permute_cols.py" fi @@ -105,19 +112,33 @@ fi if [[ $commands == *" entrypoints/openai "* ]]; then commands=${commands//" entrypoints/openai "/" entrypoints/openai \ 
--ignore=entrypoints/openai/test_audio.py \ - --ignore=entrypoints/openai/test_chat.py \ --ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/test_completion.py \ --ignore=entrypoints/openai/test_sleep.py \ --ignore=entrypoints/openai/test_models.py \ + --ignore=entrypoints/openai/test_lora_adapters.py \ + --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ + --ignore=entrypoints/openai/test_root_path.py \ + --ignore=entrypoints/openai/test_tokenization.py \ --ignore=entrypoints/openai/test_prompt_validation.py "} fi #ignore certain Entrypoints/llm tests -if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then - commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} +if [[ $commands == *" entrypoints/llm "* ]]; then + commands=${commands//" entrypoints/llm "/" entrypoints/llm \ + --ignore=entrypoints/llm/test_chat.py \ + --ignore=entrypoints/llm/test_accuracy.py \ + --ignore=entrypoints/llm/test_init.py \ + --ignore=entrypoints/llm/test_generate_multiple_loras.py \ + --ignore=entrypoints/llm/test_prompt_validation.py "} fi +#Obsolete currently +##ignore certain Entrypoints/llm tests +#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then +# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} +#fi + # --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_oot_registration.py @@ -134,9 +155,10 @@ if [[ $commands == *"--shard-id="* ]]; then # assign shard-id for each shard commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} echo "Shard ${GPU} commands:$commands_gpu" + echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ - --device /dev/kfd --device /dev/dri \ - --network host \ + --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ + --network=host \ --shm-size=16gb \ --rm \ -e 
HIP_VISIBLE_DEVICES="${GPU}" \ @@ -163,9 +185,10 @@ if [[ $commands == *"--shard-id="* ]]; then fi done else + echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ - --device /dev/kfd --device /dev/dri \ - --network host \ + --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ + --network=host \ --shm-size=16gb \ --rm \ -e HIP_VISIBLE_DEVICES=0 \ diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh new file mode 100755 index 0000000000000..036cfea9431cb --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Setup cleanup +remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; } +trap remove_docker_container EXIT +remove_docker_container + +# Try building the docker image +podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le . 
+ +# Run the image +podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc + +function cpu_tests() { + + # offline inference + podman exec cpu-test-ubi9-ppc bash -c " + set -e + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + + # Run basic model test + podman exec cpu-test-ubi9-ppc bash -c " + set -e + pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib + pip install sentence-transformers datamodel_code_generator + pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach] + pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5] + pytest -v -s tests/models/encoder_decoder/language -m cpu_model" +} + +# All of CPU tests are expected to be finished less than 40 mins. +export -f cpu_tests +timeout 40m bash -c cpu_tests + diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh similarity index 87% rename from .buildkite/run-cpu-test-ppc64le.sh rename to .buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh index bc06838d804ff..a97fa502e6cfc 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh @@ -10,5 +10,4 @@ trap remove_docker_container EXIT remove_docker_container # Try building the docker image -docker build -t cpu-test -f Dockerfile.ppc64le . - +docker build -t cpu-test -f docker/Dockerfile.s390x . 
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh similarity index 86% rename from .buildkite/run-cpu-test.sh rename to .buildkite/scripts/hardware_ci/run-cpu-test.sh index 05744bb5225b8..40f3df96065d1 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -8,15 +8,19 @@ set -ex CORE_RANGE=${CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} -# Try building the docker image -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu . - # Setup cleanup -remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; } +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; + docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; +} trap remove_docker_container EXIT remove_docker_container +# Try building the docker image +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu . + # Run the image, setting --shm-size=4g for tensor parallel. 
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" @@ -36,8 +40,6 @@ function cpu_tests() { # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install -r vllm/requirements/test.txt - pip install -r vllm/requirements/cpu.txt pytest -v -s tests/kernels/test_cache.py -m cpu_model pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/models/decoder_only/language -m cpu_model diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh similarity index 97% rename from .buildkite/run-gh200-test.sh rename to .buildkite/scripts/hardware_ci/run-gh200-test.sh index 5c004b47778fb..8c64e14606d3b 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -9,6 +9,7 @@ python3 use_existing_torch.py # Try building the docker image DOCKER_BUILDKIT=1 docker build . \ + --file docker/Dockerfile \ --target vllm-openai \ --platform "linux/arm64" \ -t gh200-test \ diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh similarity index 94% rename from .buildkite/run-hpu-test.sh rename to .buildkite/scripts/hardware_ci/run-hpu-test.sh index f83eb927aae4e..95b6ac37f1857 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -5,7 +5,7 @@ set -ex # Try building the docker image -docker build -t hpu-test-env -f Dockerfile.hpu . +docker build -t hpu-test-env -f docker/Dockerfile.hpu . 
# Setup cleanup # certain versions of HPU software stack have a bug that can diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh similarity index 97% rename from .buildkite/run-neuron-test.sh rename to .buildkite/scripts/hardware_ci/run-neuron-test.sh index ad5ae6f415748..ec6a080eb499f 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh @@ -35,7 +35,7 @@ else date "+%s" > /tmp/neuron-docker-build-timestamp fi -docker build -t "${image_name}" -f Dockerfile.neuron . +docker build -t "${image_name}" -f docker/Dockerfile.neuron . # Setup cleanup remove_docker_container() { diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh similarity index 58% rename from .buildkite/run-tpu-v1-test.sh rename to .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index d557feefba7aa..704bc6b7324da 100755 --- a/.buildkite/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -1,9 +1,9 @@ #!/bin/bash -set -e +set -xue # Build the docker image. -docker build -f Dockerfile.tpu -t vllm-tpu . +docker build -f docker/Dockerfile.tpu -t vllm-tpu . # Set up cleanup. 
remove_docker_container() { docker rm -f tpu-test || true; } @@ -17,12 +17,16 @@ source /etc/environment docker run --privileged --net host --shm-size=16G -it \ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ - && python3 -m pip install pytest \ + && python3 -m pip install pytest pytest-asyncio tpu-info \ && python3 -m pip install lm_eval[api]==0.4.4 \ && export VLLM_USE_V1=1 \ && export VLLM_XLA_CHECK_RECOMPILATION=1 \ + && echo HARDWARE \ + && tpu-info \ + && echo TEST_0 \ + && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \ && echo TEST_1 \ - && pytest /workspace/vllm/tests/tpu/test_compilation.py \ + && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \ && echo TEST_2 \ && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \ && echo TEST_3 \ @@ -32,9 +36,18 @@ docker run --privileged --net host --shm-size=16G -it \ && echo TEST_5 \ && python3 /workspace/vllm/examples/offline_inference/tpu.py \ && echo TEST_6 \ - && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \ + && echo TEST_7 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \ + && echo TEST_8 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \ + && echo TEST_9 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \ + && echo TEST_10 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \ + && echo TEST_11 \ + && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \ # TODO: This test fails because it uses RANDOM_SEED sampling # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ - diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh similarity index 94% rename from .buildkite/run-xpu-test.sh rename to 
.buildkite/scripts/hardware_ci/run-xpu-test.sh index 3a0e6bdb2caaf..f54010c4231f9 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}" container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" # Try building the docker image -docker build -t ${image_name} -f Dockerfile.xpu . +docker build -t ${image_name} -f docker/Dockerfile.xpu . # Setup cleanup remove_docker_container() { diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh similarity index 97% rename from .buildkite/run-benchmarks.sh rename to .buildkite/scripts/run-benchmarks.sh index 1641c1faa9d6a..195a8063fd743 100644 --- a/.buildkite/run-benchmarks.sh +++ b/.buildkite/scripts/run-benchmarks.sh @@ -5,8 +5,8 @@ set -ex set -o pipefail -# cd into parent directory of this file -cd "$(dirname "${BASH_SOURCE[0]}")/.." +# cd 2 levels into the working directory +cd "$(dirname "${BASH_SOURCE[0]}")/../.." (which wget && which curl) || (apt-get update && apt-get install -y wget curl) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/scripts/run-multi-node-test.sh similarity index 96% rename from .buildkite/run-multi-node-test.sh rename to .buildkite/scripts/run-multi-node-test.sh index 530bf90a855fe..49aebce786b92 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/scripts/run-multi-node-test.sh @@ -3,7 +3,7 @@ set -euox pipefail if [[ $# -lt 4 ]]; then - echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" + echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... 
COMMANDN" exit 1 fi diff --git a/.buildkite/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh similarity index 100% rename from .buildkite/upload-wheels.sh rename to .buildkite/scripts/upload-wheels.sh diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f22b2b0ab6f2f..ec00bc7f108df 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -8,6 +8,7 @@ # Documentation # label(str): the name of the test. emoji allowed. # fast_check(bool): whether to run this on each commit on fastcheck pipeline. +# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline. # fast_check_only(bool): run this test on fastcheck pipeline only # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. # command(str): the single command to run for tests. incompatible with commands. @@ -70,6 +71,7 @@ steps: - label: Basic Correctness Test # 30min #mirror_hardwares: [amd] fast_check: true + torch_nightly: true source_file_dependencies: - vllm/ - tests/basic_correctness/test_basic_correctness @@ -104,7 +106,8 @@ steps: - label: Entrypoints Test # 40min working_dir: "/vllm-workspace/tests" fast_check: true - mirror_hardwares: [amd] + torch_nightly: true + #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/entrypoints/llm @@ -118,7 +121,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py 
--ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py - pytest -v -s entrypoints/test_chat_utils.py - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -135,12 +138,14 @@ steps: - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py + - tests/v1/test_async_llm_dp.py commands: # test with tp=2 and external_dp=2 - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py @@ -148,11 +153,12 @@ steps: # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - pushd ../examples/offline_inference - - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py - - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - python3 rlhf.py + - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd - label: Metrics, Tracing Test # 10min + mirror_hardwares: [amd] num_gpus: 2 source_file_dependencies: - vllm/ @@ -160,18 +166,13 @@ steps: - tests/tracing commands: - pytest -v -s metrics - - "pip install \ - 'opentelemetry-sdk>=1.26.0,<1.27.0' \ - 'opentelemetry-api>=1.26.0,<1.27.0' \ - 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'" - pytest -v -s tracing ##### fast check tests ##### ##### 1 GPU test ##### - label: Regression Test # 5min - mirror_hardwares: [amd] + #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/test_regression @@ -202,12 +203,13 @@ steps: commands: # split the test to avoid interference - pytest -v -s 
v1/core - - pytest -v -s v1/entrypoints - pytest -v -s v1/engine - pytest -v -s v1/entrypoints - pytest -v -s v1/sample - pytest -v -s v1/worker - pytest -v -s v1/structured_output + - pytest -v -s v1/spec_decode + - pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_stats.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py @@ -283,13 +285,21 @@ steps: - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each - mirror_hardwares: [amd] + #mirror_hardwares: [amd] source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py parallelism: 4 +- label: PyTorch Compilation Unit Tests + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_pass_manager.py + - pytest -v -s compile/test_fusion.py + - label: PyTorch Fullgraph Smoke Test # 9min source_file_dependencies: - vllm/ @@ -299,7 +309,6 @@ steps: # these tests need to be separated, cannot combine - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py - - pytest -v -s compile/test_pass_manager.py - label: PyTorch Fullgraph Test # 18min source_file_dependencies: @@ -308,18 +317,49 @@ steps: commands: - pytest -v -s compile/test_full_graph.py -- label: Kernels Test %N # 1h each - mirror_hardwares: [amd] +- label: Kernels Core Operation Test source_file_dependencies: - csrc/ - - vllm/attention - - tests/kernels + - tests/kernels/core commands: - - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 4 + - pytest -v 
-s kernels/core + +- label: Kernels Attention Test %N + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test + source_file_dependencies: + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + commands: + - pytest -v -s kernels/moe + +- label: Kernels Mamba Test + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + commands: + - pytest -v -s kernels/mamba - label: Tensorizer Test # 11min - mirror_hardwares: [amd] + # mirror_hardwares: [amd] soft_fail: true source_file_dependencies: - vllm/model_executor/model_loader @@ -335,7 +375,14 @@ steps: source_file_dependencies: - benchmarks/ commands: - - bash run-benchmarks.sh + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # 10min + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ - label: Quantization Test # 33min source_file_dependencies: @@ -370,12 +417,14 @@ steps: - label: OpenAI-Compatible Tool Use # 20 min fast_check: false - mirror_hardwares: [ amd ] + #mirror_hardwares: [ amd ] source_file_dependencies: - vllm/ - tests/tool_use + - tests/mistral_tool_use commands: - pytest -v -s tool_use + - pytest -v -s mistral_tool_use ##### models test ##### @@ -387,7 +436,9 @@ steps: - pytest -v -s models/test_transformers.py - pytest -v -s models/test_registry.py # V1 Test: https://github.com/vllm-project/vllm/issues/14531 - - VLLM_USE_V1=0 pytest -v -s 
models/test_initialization.py + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2' + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4' + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2' - label: Language Models Test (Standard) # 32min #mirror_hardwares: [amd] @@ -397,6 +448,8 @@ steps: - tests/models/embedding/language - tests/models/encoder_decoder/language commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install causal-conv1d - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model @@ -408,6 +461,8 @@ steps: - tests/models/embedding/language - tests/models/encoder_decoder/language commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install causal-conv1d - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' @@ -424,11 +479,12 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' + - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model' - pytest -v -s models/embedding/vision_language -m core_model - pytest -v -s models/encoder_decoder/audio_language -m core_model - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model + - pytest -v -s models/decoder_only/vision_language/test_interleaved.py - label: Multi-Modal Models Test (Extended) 1 # 48m optional: true @@ -442,10 +498,7 @@ steps: - pip install 
git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' - # HACK - run phi3v tests separately to sidestep this transformers bug - # https://github.com/huggingface/transformers/issues/34307 - - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' @@ -461,6 +514,7 @@ steps: # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test + mirror_hardwares: [amd] optional: true commands: - echo 'Testing custom models...' 
@@ -472,6 +526,7 @@ steps: ##### multi gpus test ##### - label: Distributed Comm Ops Test # 7min + mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: @@ -514,8 +569,11 @@ steps: - vllm/worker/worker.py - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py + - tests/v1/test_async_llm_dp.py + - vllm/v1/engine/ commands: - - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -530,6 +588,7 @@ steps: # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" @@ -596,8 +655,6 @@ steps: # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_minicpmv_tp.py - - pytest -v -s -x lora/test_transfomers_model.py - label: Weight Loading Multiple GPU Test # 33min diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml index 590e56c137813..34da4019687b2 100644 --- a/.github/ISSUE_TEMPLATE/200-installation.yml +++ b/.github/ISSUE_TEMPLATE/200-installation.yml @@ -14,7 +14,7 @@ body: description: | Please run the following and paste the output below. 
```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml index 004798a388a63..c9e4be0e7719f 100644 --- a/.github/ISSUE_TEMPLATE/300-usage.yml +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -14,7 +14,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index d4113da8b5b81..b96ab40749003 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -14,7 +14,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. 
python collect_env.py ``` diff --git a/.github/ISSUE_TEMPLATE/600-new-model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml index 713e76c1a5cec..5f0125ef98096 100644 --- a/.github/ISSUE_TEMPLATE/600-new-model.yml +++ b/.github/ISSUE_TEMPLATE/600-new-model.yml @@ -9,7 +9,7 @@ body: value: > #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). - #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model. + #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model. - type: textarea attributes: label: The model to consider. diff --git a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml index 273f50d59cf76..3d31c11550167 100644 --- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml @@ -35,7 +35,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. 
python collect_env.py ``` diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index a20c5baf895c1..7042e81a84daa 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) -**BEFORE SUBMITTING, PLEASE READ ** +**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.github/mergify.yml b/.github/mergify.yml index 54f56210b286a..2033722b5f33c 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -19,7 +19,7 @@ pull_request_rules: - files~=\.buildkite/ - files~=^cmake/ - files=CMakeLists.txt - - files~=^Dockerfile + - files~=^docker/Dockerfile - files~=^requirements.*\.txt - files=setup.py actions: @@ -55,11 +55,19 @@ pull_request_rules: description: Automatically apply structured-output label conditions: - or: + - files~=^benchmarks/structured_schemas/ + - files=benchmarks/benchmark_serving_structured_output.py + - files=benchmarks/run_structured_output_benchmark.sh + - files=docs/source/features/structured_outputs.md + - files=examples/offline_inference/structured_outputs.py + - files=examples/online_serving/openai_chat_completion_structured_outputs.py + - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files~=^vllm/model_executor/guided_decoding/ - files=tests/model_executor/test_guided_processors.py - files=tests/entrypoints/llm/test_guided_generate.py - - files=benchmarks/benchmark_serving_guided.py - - files=benchmarks/benchmark_guided.py + - files~=^tests/v1/structured_output/ + - files=tests/v1/entrypoints/llm/test_guided_generate.py + - files~=^vllm/v1/structured_output/ actions: label: add: @@ -88,6 +96,36 @@ pull_request_rules: add: - v1 +- name: label-tpu + description: Automatically apply tpu label + # Keep this list in sync with `label-tpu-remove` conditions + conditions: + - or: + - 
files~=tpu.py + - files~=_tpu + - files~=tpu_ + - files~=/tpu/ + - files~=pallas + actions: + label: + add: + - tpu + +- name: label-tpu-remove + description: Automatically remove tpu label + # Keep this list in sync with `label-tpu` conditions + conditions: + - and: + - -files~=tpu.py + - -files~=_tpu + - -files~=tpu_ + - -files~=/tpu/ + - -files~=pallas + actions: + label: + remove: + - tpu + - name: ping author on conflicts and add 'needs-rebase' label conditions: - conflict diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index b199d0867a648..7b1d9f69938c8 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -50,7 +50,7 @@ jobs: uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0 - name: Build the Docker image vllm cpu - run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env . + run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env . - name: Configuration of docker images, network and namespace for the kind cluster run: | diff --git a/.gitignore b/.gitignore index 6f5cbd0733da0..06d2b1e83b7b5 100644 --- a/.gitignore +++ b/.gitignore @@ -203,3 +203,6 @@ benchmarks/**/*.json # Linting actionlint shellcheck*/ + +# Ingore moe/marlin_moe gen code +csrc/moe/marlin_moe_wna16/kernel_* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 484cd171f5f52..f76b24c025ffb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,6 @@ +default_install_hook_types: + - pre-commit + - commit-msg default_stages: - pre-commit # Run locally - manual # Run in CI @@ -8,7 +11,6 @@ repos: hooks: - id: yapf args: [--in-place, --verbose] - additional_dependencies: [toml] # TODO: Remove when yapf is upgraded - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.3 hooks: @@ -119,6 +121,12 @@ repos: language: system always_run: true pass_filenames: false + - id: update-dockerfile-graph + name: Update Dockerfile 
dependency graph + entry: tools/update-dockerfile-graph.sh + language: script + files: ^docker/Dockerfile$ + pass_filenames: false # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/CMakeLists.txt b/CMakeLists.txt index 65d1ddbeee0b2..21464a0560d93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") # Supported AMD GPU architectures. -set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") # # Supported/expected torch versions for CUDA/ROCm. @@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101") # # Note: the CUDA torch version is derived from pyproject.toml and various # requirements.txt files and should be kept consistent. The ROCm torch -# versions are derived from Dockerfile.rocm +# versions are derived from docker/Dockerfile.rocm # set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0") @@ -230,10 +230,12 @@ set(VLLM_EXT_SRC "csrc/cache_kernels.cu" "csrc/attention/paged_attention_v1.cu" "csrc/attention/paged_attention_v2.cu" + "csrc/attention/merge_attn_states.cu" "csrc/pos_encoding_kernels.cu" "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" "csrc/layernorm_quant_kernels.cu" + "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" @@ -241,6 +243,7 @@ set(VLLM_EXT_SRC "csrc/quantization/gguf/gguf_kernel.cu" "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" + "csrc/custom_all_reduce.cu" "csrc/torch_bindings.cpp") if(VLLM_GPU_LANG STREQUAL "CUDA") @@ -282,7 +285,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/quantization/aqlm/gemm_kernels.cu" 
"csrc/quantization/awq/gemm_kernels.cu" - "csrc/custom_all_reduce.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu" @@ -461,6 +463,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(FP4_ARCHS) endif() + # + # CUTLASS MoE kernels + + # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works + # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible + # to compile MoE kernels that use its output. + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" + "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1") + message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) + message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " + "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " + "if you intend on running FP8 quantized MoE models on Hopper.") + else() + message(STATUS "Not building grouped_mm_c3x as no compatible archs found " + "in CUDA target architectures") + endif() + endif() + # # Machete kernels @@ -580,21 +609,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) - set(MARLIN_MOE_SRC - "csrc/moe/marlin_kernels/marlin_moe_kernel.h" - "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h" - "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu" - "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h" - 
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu" - "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h" - "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu" - "csrc/moe/marlin_moe_ops.cu") + # + # For the Marlin MOE kernels we automatically generate sources for various + # preselected input type pairs and schedules. + # Generate sources: + set(MOE_MARLIN_GEN_SCRIPT + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py) + file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH) + + message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}") + message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}") + + if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} + OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} + RESULT_VARIABLE moe_marlin_generation_result + OUTPUT_VARIABLE moe_marlin_generation_output + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log + ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log + ) + + if (NOT moe_marlin_generation_result EQUAL 0) + message(FATAL_ERROR "Marlin MOE generation failed." 
+ " Result: \"${moe_marlin_generation_result}\"" + "\nCheck the log for details: " + "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log") + else() + set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH} + CACHE STRING "Last run Marlin MOE generate script hash" FORCE) + message(STATUS "Marlin MOE generation completed successfully.") + endif() + else() + message(STATUS "Marlin MOE generation script has not changed, skipping generation.") + endif() + + file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu") set_gencode_flags_for_srcs( - SRCS "${MARLIN_MOE_SRC}" + SRCS "${MOE_WNAA16_MARLIN_SRC}" CUDA_ARCHS "${MARLIN_MOE_ARCHS}") - list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}") + list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) + message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" @@ -619,6 +678,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP") # set(VLLM_ROCM_EXT_SRC "csrc/rocm/torch_bindings.cpp" + "csrc/rocm/skinny_gemms.cu" "csrc/rocm/attention.cu") define_gpu_extension_target( diff --git a/Dockerfile.cpu b/Dockerfile.cpu deleted file mode 100644 index a10090529d8a9..0000000000000 --- a/Dockerfile.cpu +++ /dev/null @@ -1,69 +0,0 @@ -# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 
- -FROM ubuntu:22.04 AS cpu-test-1 - -ENV CCACHE_DIR=/root/.cache/ccache - -ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache - -RUN --mount=type=cache,target=/var/cache/apt \ - apt-get update -y \ - && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ - && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - -# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html -# intel-openmp provides additional performance improvement vs. openmp -# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install intel-openmp==2025.0.1 - -ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" - -RUN echo 'ulimit -c 0' >> ~/.bashrc - -RUN pip install intel_extension_for_pytorch==2.6.0 - -WORKDIR /workspace - -ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" -ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ - pip install --upgrade pip && \ - pip install -r requirements/build.txt - -FROM cpu-test-1 AS build - -WORKDIR /workspace/vllm - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ - --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ - pip install -v -r requirements/cpu.txt - -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... 
-ARG VLLM_CPU_DISABLE_AVX512 -ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=bind,source=.git,target=.git \ - VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ - pip install dist/*.whl && \ - rm -rf dist - -WORKDIR /workspace/ - -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks - -# install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -e tests/vllm_test_utils - -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/README.md b/README.md index f2da0467e5c34..dda3ae6009f55 100644 --- a/README.md +++ b/README.md @@ -10,19 +10,14 @@ Easy, fast, and cheap LLM serving for everyone

-| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack | +| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |

--- -[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center! - -[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day) - ---- - *Latest News* 🔥 - +- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). +- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! 
Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. @@ -103,7 +98,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. ## Contributing We welcome and value any contributions and collaborations. -Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. +Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved. ## Sponsors @@ -126,6 +121,7 @@ Compute Resources: - Databricks - DeepInfra - Google Cloud +- Intel - Lambda Lab - Nebius - Novita AI diff --git a/benchmarks/README.md b/benchmarks/README.md index d41de1caa04c0..4a8ab895e18e9 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -41,29 +41,39 @@ become available. synthetic - HuggingFace - 🟡 - 🟡 - Specify your dataset path on HuggingFace + HuggingFace-VisionArena + ✅ + ✅ + lmarena-ai/VisionArena-Chat - VisionArena + HuggingFace-InstructCoder ✅ ✅ - lmarena-ai/vision-arena-bench-v0.1 (a HuggingFace dataset) + likaixin/InstructCoder + + + HuggingFace-AIMO + ✅ + ✅ + AI-MO/aimo-validation-aime , AI-MO/NuminaMath-1.5, AI-MO/NuminaMath-CoT + + + HuggingFace-Other + ✅ + ✅ + lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered ✅: supported +🟡: Partial support + 🚧: to be supported -🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats -similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. -If you need support for other dataset formats, please consider contributing. 
- -**Note**: VisionArena’s `dataset-name` should be set to `hf` +**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` --- ## Example - Online Benchmark @@ -71,8 +81,7 @@ If you need support for other dataset formats, please consider contributing. First start serving your model ```bash -MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" -vllm serve ${MODEL_NAME} --disable-log-requests +vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests ``` Then run the benchmarking script @@ -80,12 +89,13 @@ Then run the benchmarking script ```bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" -NUM_PROMPTS=10 -BACKEND="vllm" -DATASET_NAME="sharegpt" -DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" -python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} +python3 vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 10 ``` If successful, you will see the following output @@ -122,88 +132,105 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ``` ```bash -MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" -NUM_PROMPTS=10 -BACKEND="openai-chat" -DATASET_NAME="hf" -DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1" -DATASET_SPLIT='train' - python3 vllm/benchmarks/benchmark_serving.py \ - --backend "${BACKEND}" \ - --model "${MODEL_NAME}" \ - --endpoint "/v1/chat/completions" \ - --dataset-name "${DATASET_NAME}" \ - --dataset-path "${DATASET_PATH}" \ - --hf-split "${DATASET_SPLIT}" \ - --num-prompts "${NUM_PROMPTS}" + --backend openai-chat \ + --model 
Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path lmarena-ai/VisionArena-Chat \ + --hf-split train \ + --num-prompts 1000 ``` -### HuggingFaceDataset Examples +### InstructCoder Benchmark with Speculative Decoding -Currently, HuggingFaceDataset only supports dataset formats -similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset -formats, please consider contributing. +``` bash +VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ + --speculative-model "[ngram]" \ + --ngram_prompt_lookup_min 2 \ + --ngram-prompt-lookup-max 5 \ + --num_speculative_tokens 5 +``` + +``` bash +python3 benchmarks/benchmark_serving.py \ + --model meta-llama/Meta-Llama-3-8B-Instruct \ + --dataset-name hf \ + --dataset-path likaixin/InstructCoder \ + --num-prompts 2048 +``` + +### Other HuggingFaceDataset Examples ```bash -# need a model with vision capability here vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ``` **`lmms-lab/LLaVA-OneVision-Data`** ```bash -MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" -NUM_PROMPTS=10 -BACKEND="openai-chat" -DATASET_NAME="hf" -DATASET_PATH="lmms-lab/LLaVA-OneVision-Data" -DATASET_SPLIT='train' -DATASET_SUBSET='chart2text(cauldron)' python3 vllm/benchmarks/benchmark_serving.py \ - --backend "${BACKEND}" \ - --model "${MODEL_NAME}" \ - --endpoint "/v1/chat/completions" \ - --dataset-name "${DATASET_NAME}" \ - --dataset-path "${DATASET_PATH}" \ - --hf-split "${DATASET_SPLIT}" \ - --num-prompts "${NUM_PROMPTS}" \ - --hf-subset "${DATASET_SUBSET}" + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path lmms-lab/LLaVA-OneVision-Data \ + --hf-split train \ + --hf-subset "chart2text(cauldron)" \ + --num-prompts 10 ``` **`Aeala/ShareGPT_Vicuna_unfiltered`** ```bash -MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" -NUM_PROMPTS=10 -BACKEND="openai-chat" 
-DATASET_NAME="hf" -DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered" -DATASET_SPLIT='train' python3 vllm/benchmarks/benchmark_serving.py \ - --backend "${BACKEND}" \ - --model "${MODEL_NAME}" \ - --endpoint "/v1/chat/completions" \ - --dataset-name "${DATASET_NAME}" \ - --dataset-path "${DATASET_PATH}" \ - --hf-split "${DATASET_SPLIT}" \ - --num-prompts "${NUM_PROMPTS}" \ + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ + --hf-split train \ + --num-prompts 10 +``` + +**`AI-MO/aimo-validation-aime`** + +``` bash +python3 vllm/benchmarks/benchmark_serving.py \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path AI-MO/aimo-validation-aime \ + --num-prompts 10 \ + --seed 42 +``` + +### Running With Sampling Parameters + +When using OpenAI-compatible backends such as `vllm`, optional sampling +parameters can be specified. Example client command: + +```bash +python3 vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --top-k 10 \ + --top-p 0.9 \ + --temperature 0.5 \ + --num-prompts 10 ``` --- ## Example - Offline Throughput Benchmark ```bash -MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" -NUM_PROMPTS=10 -DATASET_NAME="sonnet" -DATASET_PATH="vllm/benchmarks/sonnet.txt" - python3 vllm/benchmarks/benchmark_throughput.py \ - --model "${MODEL_NAME}" \ - --dataset-name "${DATASET_NAME}" \ - --dataset-path "${DATASET_PATH}" \ - --num-prompts "${NUM_PROMPTS}" + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset-name sonnet \ + --dataset-path vllm/benchmarks/sonnet.txt \ + --num-prompts 10 ``` If successful, you will see the following output @@ -217,19 +244,13 @@ Total num output tokens: 1500 ### VisionArena Benchmark for Vision Language Models ``` bash 
-MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" -NUM_PROMPTS=10 -DATASET_NAME="hf" -DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1" -DATASET_SPLIT="train" - python3 vllm/benchmarks/benchmark_throughput.py \ - --model "${MODEL_NAME}" \ - --backend "vllm-chat" \ - --dataset-name "${DATASET_NAME}" \ - --dataset-path "${DATASET_PATH}" \ - --num-prompts "${NUM_PROMPTS}" \ - --hf-split "${DATASET_SPLIT}" + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path lmarena-ai/VisionArena-Chat \ + --num-prompts 1000 \ + --hf-split train ``` The `num prompt tokens` now includes image token counts @@ -240,29 +261,83 @@ Total num prompt tokens: 14527 Total num output tokens: 1280 ``` +### InstructCoder Benchmark with Speculative Decoding + +``` bash +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +VLLM_USE_V1=1 \ +python3 vllm/benchmarks/benchmark_throughput.py \ + --dataset-name=hf \ + --dataset-path=likaixin/InstructCoder \ + --model=meta-llama/Meta-Llama-3-8B-Instruct \ + --input-len=1000 \ + --output-len=100 \ + --num-prompts=2048 \ + --async-engine \ + --speculative-model="[ngram]" \ + --ngram_prompt_lookup_min=2 \ + --ngram-prompt-lookup-max=5 \ + --num_speculative_tokens=5 +``` + +``` +Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s +Total num prompt tokens: 261136 +Total num output tokens: 204800 +``` + +### Other HuggingFaceDataset Examples + +**`lmms-lab/LLaVA-OneVision-Data`** + +```bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path lmms-lab/LLaVA-OneVision-Data \ + --hf-split train \ + --hf-subset "chart2text(cauldron)" \ + --num-prompts 10 +``` + +**`Aeala/ShareGPT_Vicuna_unfiltered`** + +```bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ + --hf-split train \ + 
--num-prompts 10 +``` + +**`AI-MO/aimo-validation-aime`** + +```bash +python3 benchmarks/benchmark_throughput.py \ + --model Qwen/QwQ-32B \ + --backend vllm \ + --dataset-name hf \ + --dataset-path AI-MO/aimo-validation-aime \ + --hf-split train \ + --num-prompts 10 +``` + ### Benchmark with LoRA Adapters ``` bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -MODEL_NAME="meta-llama/Llama-2-7b-hf" -BACKEND="vllm" -DATASET_NAME="sharegpt" -DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" -NUM_PROMPTS=10 -MAX_LORAS=2 -MAX_LORA_RANK=8 -ENABLE_LORA="--enable-lora" -LORA_PATH="yard1/llama-2-7b-sql-lora-test" - python3 vllm/benchmarks/benchmark_throughput.py \ - --model "${MODEL_NAME}" \ - --backend "${BACKEND}" \ - --dataset_path "${DATASET_PATH}" \ - --dataset_name "${DATASET_NAME}" \ - --num-prompts "${NUM_PROMPTS}" \ - --max-loras "${MAX_LORAS}" \ - --max-lora-rank "${MAX_LORA_RANK}" \ - ${ENABLE_LORA} \ - --lora-path "${LORA_PATH}" + --model meta-llama/Llama-2-7b-hf \ + --backend vllm \ + --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --dataset_name sharegpt \ + --num-prompts 10 \ + --max-loras 2 \ + --max-lora-rank 8 \ + --enable-lora \ + --lora-path yard1/llama-2-7b-sql-lora-test ``` diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 0f13c79ae234b..efd51c79c37cf 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +import io import json import os import sys @@ -32,6 +33,7 @@ class RequestFuncInput: extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None ignore_eos: bool = False + language: Optional[str] = None @dataclass @@ -219,7 +221,15 @@ async def async_request_deepspeed_mii( if response.status == 200: parsed_resp = await response.json() output.latency = time.perf_counter() - st - 
output.generated_text = parsed_resp["text"][0] + if "choices" in parsed_resp: + output.generated_text = parsed_resp["choices"][0][ + "text"] + elif "text" in parsed_resp: + output.generated_text = parsed_resp["text"][0] + else: + output.error = ("Unexpected response format: " + "neither 'choices' nor 'text' found") + output.success = False output.success = True else: output.error = response.reason or "" @@ -428,6 +438,110 @@ async def async_request_openai_chat_completions( return output +async def async_request_openai_audio( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + # Lazy import without PlaceholderModule to avoid vllm dep. + import soundfile + api_url = request_func_input.api_url + assert api_url.endswith( + ("transcriptions", "translations" + )), "OpenAI Chat Completions API URL must end with 'transcriptions' " + "or `translations`." + + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + payload = { + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, + "temperature": 0.0, + "max_completion_tokens": request_func_input.output_len, + "stream": True, + "language": "en", + # Flattened due to multipart/form-data + "stream_include_usage": True, + "stream_continuous_usage_stats": True + } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + # Send audio file + def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + with to_bytes(*request_func_input.multi_modal_content['audio']) as f: + form = aiohttp.FormData() + form.add_field('file', f, content_type='audio/wav') + for key, value in payload.items(): + form.add_field(key, str(value)) + + output = RequestFuncOutput() + 
output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, + data=form, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get( + "content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append( + timestamp - most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download @@ -485,7 +599,14 @@ ASYNC_REQUEST_FUNCS = { "deepspeed-mii": async_request_deepspeed_mii, "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, + "openai-audio": async_request_openai_audio, "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, "sglang": async_request_openai_completions, } + +OPENAI_COMPATIBLE_BACKENDS = [ + k for k, v in ASYNC_REQUEST_FUNCS.items() + if v in (async_request_openai_completions, + 
async_request_openai_chat_completions) +] diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 0567875f9862f..ccbc6c022f1f9 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -23,7 +23,8 @@ from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass from functools import cache -from typing import Any, Optional, Union +from io import BytesIO +from typing import Any, Callable, Optional, Union import numpy as np import pandas as pd @@ -63,6 +64,7 @@ class SampleRequest: class BenchmarkDataset(ABC): DEFAULT_SEED = 0 + IS_MULTIMODAL = False def __init__( self, @@ -239,21 +241,24 @@ def process_image(image: Any) -> Mapping[str, Any]: """ Process a single image input and return a multimedia content dictionary. - For a PIL.Image.Image input: - - Converts the image to RGB. - - Saves the image as a JPEG in-memory. - - Encodes the JPEG data as a base64 string. - - Returns a dictionary with the image as a base64 data URL. + Supports three input types: - For a string input: - - Treats the string as a URL or file path. - - Prepends "file://" if the string doesn't start with "http://" or - "file://". - - Returns a dictionary with the image URL. + 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key + containing raw image data. - Loads the bytes as a PIL.Image.Image. + + 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as + a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns + a dictionary with the image as a base64 data URL. + + 3. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. Raises: - ValueError: If the input is neither a PIL.Image.Image nor a string. + ValueError: If the input is not a supported type. 
""" + if isinstance(image, dict) and 'bytes' in image: + image = Image.open(BytesIO(image['bytes'])) if isinstance(image, Image.Image): image = image.convert("RGB") with io.BytesIO() as image_data: @@ -272,8 +277,8 @@ def process_image(image: Any) -> Mapping[str, Any]: ("http://", "file://")) else f"file://{image}") return {"type": "image_url", "image_url": {"url": image_url}} - raise ValueError( - f"Invalid image input {image}. Must be a PIL.Image.Image or str.") + raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image" + " or str or dictionary with raw image bytes.") # ----------------------------------------------------------------------------- @@ -284,7 +289,7 @@ def process_image(image: Any) -> Mapping[str, Any]: class RandomDataset(BenchmarkDataset): # Default values copied from benchmark_serving.py for the random dataset. DEFAULT_PREFIX_LEN = 0 - DEFAULT_RANGE_RATIO = 1.0 + DEFAULT_RANGE_RATIO = 0.0 DEFAULT_INPUT_LEN = 1024 DEFAULT_OUTPUT_LEN = 128 @@ -304,19 +309,32 @@ class RandomDataset(BenchmarkDataset): output_len: int = DEFAULT_OUTPUT_LEN, **kwargs, ) -> list[SampleRequest]: + # Enforce range_ratio < 1 + assert range_ratio < 1.0, ( + "random_range_ratio must be < 1.0 to ensure a valid sampling range" + ) + vocab_size = tokenizer.vocab_size prefix_token_ids = (np.random.randint( 0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else []) - input_low = int(input_len * range_ratio) - output_low = int(output_len * range_ratio) + # New sampling logic: [X * (1 - b), X * (1 + b)] + input_low = int(input_len * (1 - range_ratio)) + input_high = int(input_len * (1 + range_ratio)) + output_low = int(output_len * (1 - range_ratio)) + output_high = int(output_len * (1 + range_ratio)) + + # Add logging for debugging + logger.info("Sampling input_len from [%s, %s]", input_low, input_high) + logger.info("Sampling output_len from [%s, %s]", output_low, + output_high) input_lens = np.random.randint(input_low, - input_len + 1, + input_high + 1, 
size=num_requests) output_lens = np.random.randint(output_low, - output_len + 1, + output_high + 1, size=num_requests) offsets = np.random.randint(0, vocab_size, size=num_requests) @@ -468,11 +486,11 @@ class SonnetDataset(BenchmarkDataset): # Determine how many poem lines to use. num_input_lines = round((input_len - base_offset) / avg_len) - num_prefix_lines = round((prefix_len - base_offset) / avg_len) + num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) prefix_lines = self.data[:num_prefix_lines] samples = [] - for _ in range(num_requests): + while len(samples) < num_requests: extra_lines = random.choices(self.data, k=num_input_lines - num_prefix_lines) prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" @@ -480,13 +498,14 @@ class SonnetDataset(BenchmarkDataset): prompt_formatted = tokenizer.apply_chat_template( msg, add_generation_prompt=True, tokenize=False) prompt_len = len(tokenizer(prompt_formatted).input_ids) - samples.append( - SampleRequest( - prompt=prompt_formatted - if return_prompt_formatted else prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - )) + if prompt_len <= input_len: + samples.append( + SampleRequest( + prompt=prompt_formatted + if return_prompt_formatted else prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) return samples @@ -562,48 +581,48 @@ class BurstGPTDataset(BenchmarkDataset): # ----------------------------------------------------------------------------- -# HuggingFace Dataset Implementation +# HuggingFace Dataset Base Implementation # ----------------------------------------------------------------------------- - - class HuggingFaceDataset(BenchmarkDataset): - """ - Dataset class for processing a HuggingFace dataset with conversation data - and optional images. 
- """ + """Base class for datasets hosted on HuggingFace.""" + + SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set() def __init__( self, + dataset_path: str, dataset_split: str, dataset_subset: Optional[str] = None, **kwargs, ) -> None: - super().__init__(**kwargs) + super().__init__(dataset_path=dataset_path, **kwargs) + self.dataset_split = dataset_split self.dataset_subset = dataset_subset - self.load_data() def load_data(self) -> None: - if not self.dataset_path: - raise ValueError("dataset_path must be provided for loading data.") - + """Load data from HuggingFace datasets.""" self.data = load_dataset( self.dataset_path, name=self.dataset_subset, split=self.dataset_split, streaming=True, ) - if self.data.features is None or "conversations" \ - not in self.data.features: - raise ValueError( - "HuggingFaceDataset currently only supports datasets with " - "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. " - "Please consider contributing if you would like to add " - "support for additional dataset formats.") - # Shuffle and filter examples with at least 2 conversations. 
- self.data = self.data.shuffle(seed=self.random_seed).filter( - lambda x: len(x["conversations"]) >= 2) + self.data = self.data.shuffle(seed=self.random_seed) + + +# ----------------------------------------------------------------------------- +# Conversation Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ConversationDataset(HuggingFaceDataset): + """Dataset for conversation data with multimodal support.""" + SUPPORTED_DATASET_PATHS = { + 'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered' + } + IS_MULTIMODAL = True def sample(self, tokenizer: PreTrainedTokenizerBase, @@ -611,10 +630,13 @@ class HuggingFaceDataset(BenchmarkDataset): output_len: Optional[int] = None, enable_multimodal_chat: bool = False, **kwargs) -> list: + # Filter examples with at least 2 conversations + filtered_data = self.data.filter( + lambda x: len(x["conversations"]) >= 2) sampled_requests = [] dynamic_output = output_len is None - for item in self.data: + for item in filtered_data: if len(sampled_requests) >= num_requests: break conv = item["conversations"] @@ -659,29 +681,13 @@ class VisionArenaDataset(HuggingFaceDataset): """ DEFAULT_OUTPUT_LEN = 128 - VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1" - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - if self.dataset_path != self.VISION_ARENA_DATASET_PATH: - raise ValueError(f"Only support Vision Arena dataset.\ - This data path {self.dataset_path} is not valid.") - if self.dataset_subset is None and self.dataset_split != "train": - raise ValueError("Dataset split must be 'train'.") - - self.load_data() - - def load_data(self) -> None: - dataset = load_dataset( - self.dataset_path, - name=self.dataset_subset, - split=self.dataset_split, - streaming=True, - ) - self.data = dataset.shuffle(seed=self.random_seed) + SUPPORTED_DATASET_PATHS = { + "lmarena-ai/VisionArena-Chat": + lambda x: 
x["conversation"][0][0]["content"], + "lmarena-ai/vision-arena-bench-v0.1": + lambda x: x["turns"][0][0]["content"] + } + IS_MULTIMODAL = True def sample( self, @@ -697,7 +703,11 @@ class VisionArenaDataset(HuggingFaceDataset): for item in self.data: if len(sampled_requests) >= num_requests: break - prompt = item["turns"][0][0]["content"] + parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) + if parser_fn is None: + raise ValueError( + f"Unsupported dataset path: {self.dataset_path}") + prompt = parser_fn(item) mm_content = process_image(item["images"][0]) prompt_len = len(tokenizer(prompt).input_ids) if enable_multimodal_chat: @@ -715,3 +725,173 @@ class VisionArenaDataset(HuggingFaceDataset): )) self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests + + +# ----------------------------------------------------------------------------- +# Instruct Coder Dataset Implementation +# ----------------------------------------------------------------------------- + + +class InstructCoderDataset(HuggingFaceDataset): + """ + InstructCoder Dataset. + https://huggingface.co/datasets/likaixin/InstructCoder + + InstructCoder is the dataset designed for general code editing. It consists + of 114,239 instruction-input-output triplets, and covers multiple distinct + code editing scenario. 
+ """ + + DEFAULT_OUTPUT_LEN = 200 # this is the average default output length + SUPPORTED_DATASET_PATHS = { + "likaixin/InstructCoder", + } + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = f"{item['instruction']}:\n{item['input']}" + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# AIMO Dataset Implementation +# ----------------------------------------------------------------------------- + + +class AIMODataset(HuggingFaceDataset): + """ + Dataset class for processing a AIMO dataset with reasoning questions. 
+ """ + SUPPORTED_DATASET_PATHS = { + "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5", + "AI-MO/NuminaMath-CoT" + } + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs) -> list: + sampled_requests = [] + dynamic_output = output_len is None + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt, completion = item['problem'], item["solution"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence(prompt_len, + completion_len, + max_prompt_len=2048, + max_total_len=32000): + continue + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# ASR Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ASRDataset(HuggingFaceDataset): + """ + Dataset class for processing a ASR dataset for transcription. + Tested on the following set: + + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | Dataset | Domain | Speaking Style | hf-subset | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | TED-LIUM | TED talks | Oratory | release1, release2, release3| + | | | | release3-speaker-adaptation | + | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... 
| + | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | + | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | + | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | + | AMI | Meetings | Spontaneous | ihm, sdm | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + + """ # noqa: E501 + SUPPORTED_DATASET_PATHS = { + "openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium", + "edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech" + } + + DEFAULT_OUTPUT_LEN = 128 + IS_MULTIMODAL = True + + # TODO Whisper-specific. Abstract interface when more models are supported. + TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\ + "<|notimestamps|>" + skip_long_audios: bool = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs, + ) -> list: + import librosa + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + prompt = ASRDataset.TRANSCRIPTION_PREAMBLE + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests = [] + skipped = 0 + for item in self.data: + if len(sampled_requests) >= num_requests: + break + audio = item["audio"] + y, sr = audio["array"], audio["sampling_rate"] + duration_s = librosa.get_duration(y=y, sr=sr) + # Whisper max supported duration + if self.skip_long_audios and duration_s > 30: + skipped += 1 + continue + + mm_content = {"audio": (y, sr)} + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + )) + if skipped: + logger.warning("%d samples discarded from dataset due to" \ + " their length being greater than" \ + " what Whisper supports.", skipped) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests diff --git 
a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 69cb7d7038cf7..5543ccffbf078 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -7,9 +7,6 @@ On the server side, run one of the following commands: --swap-space 16 \ --disable-log-requests - (TGI backend) - ./launch_tgi_server.sh - On the client side, run: python benchmarks/benchmark_serving.py \ --backend \ @@ -37,7 +34,8 @@ from datetime import datetime from typing import Any, Optional import numpy as np -from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, +from backend_request_func import (ASYNC_REQUEST_FUNCS, + OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, RequestFuncOutput) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -52,9 +50,11 @@ try: except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser -from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, - RandomDataset, SampleRequest, ShareGPTDataset, - SonnetDataset, VisionArenaDataset) +from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset, + ConversationDataset, HuggingFaceDataset, + InstructCoderDataset, RandomDataset, + SampleRequest, ShareGPTDataset, SonnetDataset, + VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -156,7 +156,7 @@ def calculate_metrics( if outputs[i].success: output_len = outputs[i].output_tokens - if output_len is None: + if not output_len: # We use the tokenizer to count the number of output tokens # for some serving backends instead of looking at # len(outputs[i].itl) since multiple output tokens may be @@ -261,6 +261,7 @@ async def benchmark( goodput_config_dict: dict[str, float], max_concurrency: Optional[int], lora_modules: Optional[Iterable[str]], + extra_body: Optional[dict], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -273,10 
+274,6 @@ async def benchmark( input_requests[0].expected_output_len, \ input_requests[0].multi_modal_data - if backend != "openai-chat" and test_mm_content is not None: - # multi-modal benchmark is only available on OpenAI Chat backend. - raise ValueError( - "Multi-modal content is only supported on 'openai-chat' backend.") assert test_mm_content is None or isinstance(test_mm_content, dict) test_input = RequestFuncInput( model=model_id, @@ -288,6 +285,7 @@ async def benchmark( logprobs=logprobs, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, + extra_body=extra_body, ) test_output = await request_func(request_func_input=test_input) @@ -314,7 +312,8 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, multi_modal_content=test_mm_content, - ignore_eos=ignore_eos) + ignore_eos=ignore_eos, + extra_body=extra_body) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: print("Profiler started") @@ -364,7 +363,8 @@ async def benchmark( output_len=output_len, logprobs=logprobs, multi_modal_content=mm_content, - ignore_eos=ignore_eos) + ignore_eos=ignore_eos, + extra_body=extra_body) tasks.append( asyncio.create_task( limited_request_func(request_func_input=request_func_input, @@ -586,19 +586,49 @@ def main(args: argparse.Namespace): return_prompt_formatted=True) elif args.dataset_name == "hf": - # Choose between VisionArenaDataset - # and HuggingFaceDataset based on provided parameters. 
- dataset_class = (VisionArenaDataset if args.dataset_path - == VisionArenaDataset.VISION_ARENA_DATASET_PATH - and args.hf_subset is None else HuggingFaceDataset) + # all following datasets are implemented from the + # HuggingFaceDataset base class + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_class = VisionArenaDataset + args.hf_split = "train" + args.hf_subset = None + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_class = InstructCoderDataset + args.hf_split = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ConversationDataset + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_class = AIMODataset + args.hf_split = "train" + elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ASRDataset + args.hf_split = "train" + else: + supported_datasets = set([ + dataset_name for cls in HuggingFaceDataset.__subclasses__() + for dataset_name in cls.SUPPORTED_DATASET_PATHS + ]) + raise ValueError( + f"Unsupported dataset path: {args.dataset_path}. " + "Huggingface dataset only supports dataset_path" + f" from one of following: {supported_datasets}. " + "Please consider contributing if you would " + "like to add support for additional dataset formats.") + + if (dataset_class.IS_MULTIMODAL and backend not in \ + ["openai-chat", "openai-audio"]): + # multi-modal benchmark is only available on OpenAI Chat backend. 
+ raise ValueError( + "Multi-modal content is only supported on 'openai-chat' and " \ + "'openai-audio' backend.") input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, dataset_split=args.hf_split, + random_seed=args.seed, ).sample( num_requests=args.num_prompts, tokenizer=tokenizer, - random_seed=args.seed, output_len=args.hf_output_len, ) @@ -633,6 +663,26 @@ def main(args: argparse.Namespace): raise ValueError(f"Unknown dataset: {args.dataset_name}") from err goodput_config_dict = check_goodput_args(args) + # Collect the sampling parameters. + sampling_params = { + k: v + for k, v in { + "top_p": args.top_p, + "top_k": args.top_k, + "min_p": args.min_p, + "temperature": args.temperature + }.items() if v is not None + } + + # Sampling parameters are only supported by openai-compatible backend. + if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: + raise ValueError( + "Sampling parameters are only supported by openai-compatible " + "backends.") + + if "temperature" not in sampling_params: + sampling_params["temperature"] = 0.0 # Default to greedy decoding. + # Avoid GC processing "static" data - reduce pause times. gc.collect() gc.freeze() @@ -659,6 +709,7 @@ def main(args: argparse.Namespace): goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, + extra_body=sampling_params, )) # Save config and results to json @@ -881,7 +932,7 @@ if __name__ == "__main__": "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-seperated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". 
" "Default value is \"ttft,tpot,itl\".") @@ -889,7 +940,7 @@ if __name__ == "__main__": "--metric-percentiles", type=str, default="99", - help="Comma-seperated list of percentiles for selected metrics. " + help="Comma-separated list of percentiles for selected metrics. " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", @@ -956,18 +1007,23 @@ if __name__ == "__main__": random_group.add_argument( "--random-range-ratio", type=float, - default=1.0, - help="Range of sampled ratio of input/output length, " - "used only for random sampling.", + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for random sampling. Must be in the range [0, 1) to define " + "a symmetric sampling range" + "[length * (1 - range_ratio), length * (1 + range_ratio)].", ) random_group.add_argument( "--random-prefix-len", type=int, default=0, - help="Number of fixed prefix tokens before random " - " context. The length range of context in a random " - " request is [random-prefix-len, " - " random-prefix-len + random-prefix-len * random-range-ratio).") + help=("Number of fixed prefix tokens before the random context " + "in a request. " + "The total input length is the sum of `random-prefix-len` and " + "a random " + "context length sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]."), + ) hf_group = parser.add_argument_group("hf dataset options") hf_group.add_argument("--hf-subset", @@ -986,6 +1042,33 @@ if __name__ == "__main__": "from the sampled HF dataset.", ) + sampling_group = parser.add_argument_group("sampling parameters") + sampling_group.add_argument( + "--top-p", + type=float, + default=None, + help="Top-p sampling parameter. Only has effect on openai-compatible " + "backends.") + sampling_group.add_argument( + "--top-k", + type=int, + default=None, + help="Top-k sampling parameter. 
Only has effect on openai-compatible " + "backends.") + sampling_group.add_argument( + "--min-p", + type=float, + default=None, + help="Min-p sampling parameter. Only has effect on openai-compatible " + "backends.") + sampling_group.add_argument( + "--temperature", + type=float, + default=None, + help="Temperature sampling parameter. Only has effect on " + "openai-compatible backends. If not specified, default to greedy " + "decoding (i.e. temperature==0.0).") + parser.add_argument( '--tokenizer-mode', type=str, diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index c79a93faff197..5dd9b1dbd4611 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -5,16 +5,13 @@ On the server side, run one of the following commands: (vLLM OpenAI API server) vllm serve --disable-log-requests - (TGI backend) - ./launch_tgi_server.sh - On the client side, run: python benchmarks/benchmark_serving_structured_output.py \ --backend \ --model \ --dataset json \ --structured-output-ratio 1.0 \ - --structured-output-backend xgrammar \ + --structured-output-backend auto \ --request-rate 10 \ --num-prompts 1000 @@ -54,7 +51,7 @@ try: except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser -from vllm.v1.structured_output.utils import ( +from vllm.v1.structured_output.backend_xgrammar import ( has_xgrammar_unsupported_json_features) MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -133,10 +130,11 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, "description": "An unique optional field to avoid cached schemas" } + else: + json_schemas = [schema] * args.num_prompts def gen_prompt(index: int): - schema = json_schemas[index % len(json_schemas)] - return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + return f"Generate an example of a user profile given the following schema: 
{json.dumps(get_schema(index))}" # noqa: E501 def get_schema(index: int): return json_schemas[index % len(json_schemas)] @@ -966,7 +964,7 @@ if __name__ == "__main__": "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-seperated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " "Default value is \"ttft,tpot,itl\".") @@ -974,7 +972,7 @@ if __name__ == "__main__": "--metric-percentiles", type=str, default="99", - help="Comma-seperated list of percentiles for selected metrics. " + help="Comma-separated list of percentiles for selected metrics. " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", @@ -999,12 +997,14 @@ if __name__ == "__main__": type=float, default=1.0, help="Ratio of Structured Outputs requests") - parser.add_argument( - "--structured-output-backend", - type=str, - choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"], - default="xgrammar", - help="Backend to use for structured outputs") + parser.add_argument("--structured-output-backend", + type=str, + choices=[ + "outlines", "lm-format-enforcer", "xgrammar", + "guidance", "auto" + ], + default="auto", + help="Backend to use for structured outputs") args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 53869db478c51..1f65277e1bfeb 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -11,7 +11,8 @@ from typing import Any, Optional, Union import torch import uvloop -from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, +from benchmark_dataset import (AIMODataset, BurstGPTDataset, + ConversationDataset, InstructCoderDataset, RandomDataset, 
SampleRequest, ShareGPTDataset, SonnetDataset, VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json @@ -212,14 +213,17 @@ def run_hf( max_prompt_len = 0 max_output_len = 0 for i in range(len(requests)): - prompt, prompt_len, output_len = requests[i] + prompt = requests[i].prompt + prompt_len = requests[i].prompt_len + output_len = requests[i].expected_output_len # Add the prompt to the batch. batch.append(prompt) max_prompt_len = max(max_prompt_len, prompt_len) max_output_len = max(max_output_len, output_len) if len(batch) < max_batch_size and i != len(requests) - 1: # Check if we can add more requests to the batch. - _, next_prompt_len, next_output_len = requests[i + 1] + next_prompt_len = requests[i + 1].prompt_len + next_output_len = requests[i + 1].expected_output_len if (max(max_prompt_len, next_prompt_len) + max(max_output_len, next_output_len)) <= 2048: # We can add more requests to the batch. @@ -300,6 +304,7 @@ def get_requests(args, tokenizer): "input_len": args.input_len, "output_len": args.output_len, } + if args.dataset_path is None or args.dataset_name == "random": sample_kwargs["range_ratio"] = args.random_range_ratio sample_kwargs["prefix_len"] = args.prefix_len @@ -317,18 +322,23 @@ def get_requests(args, tokenizer): elif args.dataset_name == "burstgpt": dataset_cls = BurstGPTDataset elif args.dataset_name == "hf": - if args.backend != "vllm-chat": - raise ValueError( - "hf datasets only are supported by vllm-chat backend") - # Choose between VisionArenaDataset and HuggingFaceDataset based on - # provided parameters. 
- dataset_cls = (VisionArenaDataset if args.dataset_path - == VisionArenaDataset.VISION_ARENA_DATASET_PATH - and args.hf_subset is None else HuggingFaceDataset) - common_kwargs['dataset_subset'] = args.hf_subset - common_kwargs['dataset_split'] = args.hf_split - sample_kwargs["enable_multimodal_chat"] = True - + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = VisionArenaDataset + common_kwargs['dataset_subset'] = None + common_kwargs['dataset_split'] = "train" + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = InstructCoderDataset + common_kwargs['dataset_split'] = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = ConversationDataset + common_kwargs['dataset_subset'] = args.hf_subset + common_kwargs['dataset_split'] = args.hf_split + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_cls = AIMODataset + common_kwargs['dataset_subset'] = None + common_kwargs['dataset_split'] = "train" else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values @@ -462,9 +472,17 @@ def validate_args(args): warnings.warn("--hf-subset and --hf-split will be ignored \ since --dataset-name is not 'hf'.", stacklevel=2) - elif args.dataset_name == "hf" and args.backend != "vllm-chat": - raise ValueError( - "When --dataset-name is 'hf', backend must be 'vllm-chat'") + elif args.dataset_name == "hf": + if args.dataset_path in ( + VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() + | ConversationDataset.SUPPORTED_DATASET_PATHS): + assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." 
#noqa: E501 + elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS + | AIMODataset.SUPPORTED_DATASET_PATHS): + assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501 + else: + raise ValueError( + f"{args.dataset_path} is not supported by hf dataset.") # --random-range-ratio: only used when dataset_name is 'random' if args.dataset_name != 'random' and args.random_range_ratio is not None: @@ -505,6 +523,13 @@ def validate_args(args): raise ValueError( "Tokenizer must be the same as the model for MII backend.") + # --data-parallel is not supported currently. + # https://github.com/vllm-project/vllm/issues/16222 + if args.data_parallel_size > 1: + raise ValueError( + "Data parallel is not supported in offline benchmark, \ + please use benchmark serving instead") + if __name__ == "__main__": parser = FlexibleArgumentParser(description="Benchmark the throughput.") @@ -576,18 +601,30 @@ if __name__ == "__main__": default=None, help="Path to the lora adapters to use. This can be an absolute path, " "a relative path, or a Hugging Face model identifier.") - parser.add_argument("--prefix-len", - type=int, - default=None, - help="Number of prefix tokens per request." - "This is for the RandomDataset and SonnetDataset") + parser.add_argument( + "--prefix-len", + type=int, + default=None, + help=f"Number of prefix tokens to be used in RandomDataset " + "and SonnetDataset. For RandomDataset, the total input " + "length is the sum of prefix-len (default: " + f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length " + "sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]. 
For SonnetDataset, " + f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) " + "controls how much of the input is fixed lines versus " + "random lines, but the total input length remains approximately " + "input_len tokens.") # random dataset parser.add_argument( "--random-range-ratio", type=float, default=None, - help="Range of sampled ratio of input/output length, " - "used only for RandomDataSet.", + help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) " + "for sampling input/output length, " + "used only for RandomDataset. Must be in the range [0, 1) to " + "define a symmetric sampling range " + "[length * (1 - range_ratio), length * (1 + range_ratio)].", ) # hf dtaset diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py new file mode 100644 index 0000000000000..b23b4f3ea685a --- /dev/null +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -0,0 +1,236 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( + MINIMUM_BITBLAS_VERSION) + +try: + import bitblas + if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + raise ImportError("bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}") +except ImportError as e: + bitblas_import_exception = e + raise ValueError("Trying to use the bitblas backend, but could not import" + f"with the following error: {bitblas_import_exception}. 
" + "Please install bitblas through the following command: " + f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`" + ) from bitblas_import_exception + +from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target + +from vllm.utils import FlexibleArgumentParser + +parser = FlexibleArgumentParser( + description="Benchmark BitBLAS int4 on a specific target.") + +# Add arguments to the parser +parser.add_argument( + "--target", + type=str, + default=auto_detect_nvidia_target(), + help="Specify the target device for benchmarking.", +) +parser.add_argument("--group_size", + type=int, + default=None, + help="Group size for grouped quantization.") +parser.add_argument( + "--A_dtype", + type=str, + default="float16", + choices=["float16", "float32", "float64", "int32", "int8"], + help="Data type of activation A.", +) +parser.add_argument( + "--W_dtype", + type=str, + default="int4", + choices=[ + "float16", + "float32", + "float64", + "int32", + "int8", + "int4", + "int2", + "int1", + "nf4", + "fp4_e2m1", + ], + help="Data type of weight W.", +) +parser.add_argument( + "--accum_dtype", + type=str, + default="float16", + choices=["float16", "int32"], + help="Data type for accumulation.", +) +parser.add_argument( + "--out_dtype", + type=str, + default="float16", + choices=["float16", "float32", "int32", "int8"], + help="Data type for output.", +) +parser.add_argument( + "--layout", + type=str, + default="nt", + choices=["nt", "nn"], + help="Matrix layout, 'nt' for non-transpose A and transpose W.", +) +parser.add_argument("--with_bias", + action="store_true", + help="Include bias in the benchmark.") +parser.add_argument( + "--with_scaling", + action="store_true", + help="Include scaling factor in the quantization.", +) +parser.add_argument("--with_zeros", + action="store_true", + help="Include zeros in the quantization.") +parser.add_argument( + "--zeros_mode", + type=str, + default=None, + choices=["original", "rescale", "quantized"], + help="Specify the mode for 
calculating zeros.", +) + +# Parse the arguments +args = parser.parse_args() + +# Assign arguments to variables +target = args.target +A_dtype = args.A_dtype +W_dtype = args.W_dtype +accum_dtype = args.accum_dtype +out_dtype = args.out_dtype +layout = args.layout +with_bias = args.with_bias +group_size = args.group_size +with_scaling = args.with_scaling +with_zeros = args.with_zeros +zeros_mode = args.zeros_mode + +# Define a list of shared arguments that repeat in every config +shared_args = [ + A_dtype, + W_dtype, + out_dtype, + accum_dtype, + layout, + with_bias, + group_size, + with_scaling, + with_zeros, + zeros_mode, +] + +# Define just the (M, K, N) shapes in a more compact list +shapes = [ + # square test + (1, 16384, 16384), + # BLOOM-176B + (1, 43008, 14336), + (1, 14336, 14336), + (1, 57344, 14336), + (1, 14336, 57344), + # OPT-65B + (1, 9216, 9216), + (1, 36864, 9216), + (1, 9216, 36864), + (1, 22016, 8192), + # LLAMA-70B/65B + (1, 8192, 22016), + (1, 8192, 8192), + (1, 28672, 8192), + (1, 8192, 28672), + # square test + (16384, 16384, 16384), + # BLOOM-176B + (8192, 43008, 14336), + (8192, 14336, 14336), + (8192, 57344, 14336), + (8192, 14336, 57344), + # OPT-65B + (8192, 9216, 9216), + (8192, 36864, 9216), + (8192, 9216, 36864), + (8192, 22016, 8192), + # LLAMA-70B/65B + (8192, 8192, 22016), + (8192, 8192, 8192), + (8192, 28672, 8192), + (8192, 8192, 28672), +] + +# Build test shapes with all the shared arguments +test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) + for shape in shapes] + +benchmark_sets = [] +benchmark_sets.extend(test_shapes) + +benchmark_results = {} +for config_class, operator, input_args in benchmark_sets: + config = config_class(*input_args) + matmul = operator(config, target=target, enable_tuning=True) + kernel_latency = matmul.profile_latency() + + print("Time cost is: {:.3f} ms".format(kernel_latency)) + + profile_config = { + f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": { + 
"BitBLAS_top20_latency": kernel_latency, + } + } + + benchmark_results.update(profile_config) + +# Define headers for the table +headers = [ + "PrimFunc", + "Input Arguments", + "BitBLAS Top20 Latency", +] + +# Calculate column widths for pretty printing +col_widths = [0, 0, 0] +for config_key, values in benchmark_results.items(): + args_split = config_key.split("-") + func_name = args_split[0] + input_args_str = "-".join(args_split[1:]) + col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2) + col_widths[1] = max(col_widths[1], + len(input_args_str) + 2, + len(headers[1]) + 2) + col_widths[2] = max(col_widths[2], + len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2, + len(headers[2]) + 2) + # break only if you want to measure widths from a single example; + # otherwise, let it loop over all items. + +# Print header +for i, header in enumerate(headers): + headers[i] = header.ljust(col_widths[i]) +print("".join(headers)) +print("-" * sum(col_widths)) + +# Print rows +for config_key, values in benchmark_results.items(): + args_split = config_key.split("-") + func_name = args_split[0] + input_args_str = "-".join(args_split[1:]) + row = [ + func_name, + input_args_str, + f"{values['BitBLAS_top20_latency']:.3f} ms", + ] + row_str = "".join( + [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]) + print(row_str) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py new file mode 100644 index 0000000000000..bcdbf6c7551a3 --- /dev/null +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -0,0 +1,340 @@ +# SPDX-License-Identifier: Apache-2.0 + +import torch +import torch.utils.benchmark as benchmark +from benchmark_shapes import WEIGHT_SHAPES_MOE + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8, + fused_experts, + 
fused_topk) +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = [ + "nm-testing/Mixtral-8x7B-Instruct-v0.1", "nm-testing/deepseekv2-lite", + "ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m" +] +DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + +PER_ACT_TOKEN_OPTS = [False] +PER_OUT_CH_OPTS = [False] + + +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def bench_run(results: list[benchmark.Measurement], model: str, + num_experts: int, topk: int, per_act_token: bool, + per_out_ch: bool, mkn: tuple[int, int, int]): + label = "Quant Matmul" + + sub_label = ( + "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, " + "MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch, + mkn)) + + print(f"Testing: {sub_label}") + + (m, k, n) = mkn + + dtype = torch.half + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10 + + _, a_scale = ops.scaled_fp8_quant(a) + + w1_q = torch.empty((num_experts, 2 * n, k), + device="cuda", + dtype=torch.float8_e4m3fn) + w2_q = torch.empty((num_experts, k, n), + device="cuda", + dtype=torch.float8_e4m3fn) + w1_scale = torch.empty((num_experts, 1, 1), + device="cuda", + dtype=torch.float32) + w2_scale = torch.empty((num_experts, 1, 1), + device="cuda", + dtype=torch.float32) + + ab_strides1 = torch.full((num_experts, ), + k, + device="cuda", + dtype=torch.int64) + c_strides1 = torch.full((num_experts, ), + 2 * n, + device="cuda", + dtype=torch.int64) + ab_strides2 = torch.full((num_experts, ), + n, + device="cuda", + dtype=torch.int64) + c_strides2 = torch.full((num_experts, ), + k, + device="cuda", + dtype=torch.int64) + + for expert in range(num_experts): + 
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) + w1_q_notransp = w1_q.clone() + w2_q_notransp = w2_q.clone() + w1_q = w1_q.transpose(1, 2) + w2_q = w2_q.transpose(1, 2) + + score = torch.randn((m, num_experts), device="cuda", dtype=dtype) + + topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) + + def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + w1_scale: torch.Tensor, w2_scale: torch.Tensor, + a_scale: torch.Tensor, num_repeats: int): + for _ in range(num_repeats): + fused_experts(a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale) + + def run_cutlass_moe(a: torch.Tensor, a_scale: torch.Tensor, + w1: torch.Tensor, w2: torch.Tensor, + w1_scale: torch.Tensor, w2_scale: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, c_strides2: torch.Tensor, + num_repeats: int): + for _ in range(num_repeats): + cutlass_moe_fp8(a, + w1, + w2, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + a1_scale=a_scale) + + def run_cutlass_from_graph( + a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor, + w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, c_strides2: torch.Tensor): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + return cutlass_moe_fp8(a, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + a1_scale=a_scale) + + def run_triton_from_graph(a: torch.Tensor, w1: 
torch.Tensor, + w2: torch.Tensor, topk_weights: torch.Tensor, + topk_ids: torch.Tensor, w1_scale: torch.Tensor, + w2_scale: torch.Tensor, a_scale: torch.Tensor): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + return fused_experts(a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale) + + def replay_graph(graph, num_repeats): + for _ in range(num_repeats): + graph.replay() + torch.cuda.synchronize() + + cutlass_stream = torch.cuda.Stream() + cutlass_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): + run_cutlass_from_graph(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, + topk_weights, topk_ids, ab_strides1, c_strides1, + ab_strides2, c_strides2) + torch.cuda.synchronize() + + triton_stream = torch.cuda.Stream() + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph, stream=triton_stream): + run_triton_from_graph(a, w1_q_notransp, w2_q_notransp, topk_weights, + topk_ids, w1_scale, w2_scale, a_scale) + torch.cuda.synchronize() + + min_run_time = 5 + num_warmup = 5 + num_runs = 25 + + globals = { + # Baseline params + "w1": w1, + "w2": w2, + "score": score, + "topk": topk, + "w1_q_notransp": w1_q_notransp, + "w2_q_notransp": w2_q_notransp, + # Cutlass params + "a_scale": a_scale, + "w1_q": w1_q, + "w2_q": w2_q, + "w1_scale": w1_scale, + "w2_scale": w2_scale, + "ab_strides1": ab_strides1, + "c_strides1": c_strides1, + "ab_strides2": ab_strides2, + "c_strides2": c_strides2, + # cuda graph params + "cutlass_graph": cutlass_graph, + "triton_graph": triton_graph, + # Gen params + "a": a, + "topk_weights": topk_weights, + "topk_ids": topk_ids, + "num_runs": num_runs, + # Kernels + "run_triton_moe": run_triton_moe, + "run_cutlass_moe": run_cutlass_moe, + "replay_graph": replay_graph, + } + + # Warmup + run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, + 
w1_scale, w2_scale, a_scale, num_warmup) + + results.append( + benchmark.Timer( + stmt= + "run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe", + ).blocked_autorange(min_run_time=min_run_time)) + + # Warmup + replay_graph(triton_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(triton_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time)) + + # Warmup + run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, + topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, + num_warmup) + + results.append( + benchmark.Timer( + stmt= + "run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="grouped_gemm_moe", + ).blocked_autorange(min_run_time=min_run_time)) + + # Warmup + replay_graph(cutlass_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(cutlass_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="grouped_gemm_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time)) + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + results: list[benchmark.Measurement] = [] + + for model in args.models: + for tp in args.tp_sizes: + for layer in WEIGHT_SHAPES_MOE[model]: + num_experts = layer[0] + topk = layer[1] + size_k = layer[2] + size_n = layer[3] // tp + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for per_act_token in PER_ACT_TOKEN_OPTS: + for 
per_out_ch in PER_OUT_CH_OPTS: + for size_m in DEFAULT_BATCH_SIZES: + mkn = (size_m, size_k, size_n) + bench_run(results, model, num_experts, topk, + per_act_token, per_out_ch, mkn) + + compare = benchmark.Compare(results) + compare.print() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark Marlin across specified models/shapes/batches") + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES_MOE.keys(), + ) + parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-act-token", + nargs="+", + type=int, + default=[]) + parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) + + args = parser.parse_args() + main(args) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 491f8c3962f73..afe0b53077a70 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -30,19 +30,18 @@ class BenchmarkConfig(TypedDict): num_stages: int -def benchmark_config( - config: BenchmarkConfig, - num_tokens: int, - num_experts: int, - shard_intermediate_size: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - num_iters: int = 100, - block_quant_shape: List[int] = None, -) -> float: +def benchmark_config(config: BenchmarkConfig, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + block_quant_shape: List[int] = None, + use_deep_gemm: bool = 
False) -> float: init_dtype = torch.float16 if use_fp8_w8a8 else dtype x = torch.randn(num_tokens, hidden_size, dtype=dtype) if use_int8_w8a16: @@ -115,22 +114,41 @@ def benchmark_config( def run(): from vllm.model_executor.layers.fused_moe import override_config with override_config(config): - fused_moe( - x, - w1, - w2, - input_gating, - topk, - renormalize=True, - inplace=True, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_quant_shape, - ) + if use_deep_gemm: + topk_weights, topk_ids = fused_topk(x, input_gating, topk, + False) + return fused_experts( + x, + w1, + w2, + topk_weights, + topk_ids, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + allow_deep_gemm=True, + ) + else: + fused_moe( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + ) # JIT compilation & warmup run() @@ -366,6 +384,7 @@ class BenchmarkWorker: use_fp8_w8a8: bool, use_int8_w8a16: bool, block_quant_shape: List[int] = None, + use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) dtype_str = get_config_dtype_str(dtype, @@ -396,7 +415,8 @@ class BenchmarkWorker: use_fp8_w8a8, use_int8_w8a16, num_iters=100, - block_quant_shape=block_quant_shape) + block_quant_shape=block_quant_shape, + use_deep_gemm=use_deep_gemm) return config, kernel_time def tune( @@ -411,6 +431,7 @@ class BenchmarkWorker: use_int8_w8a16: bool, search_space: list[dict[str, int]], block_quant_shape: list[int], + use_deep_gemm: bool, ) -> dict[str, int]: best_config = None best_time = float("inf") @@ -436,7 +457,8 @@ class BenchmarkWorker: 
use_fp8_w8a8, use_int8_w8a16, num_iters=20, - block_quant_shape=block_quant_shape) + block_quant_shape=block_quant_shape, + use_deep_gemm=use_deep_gemm) except triton.runtime.autotuner.OutOfResources: # Some configurations may be invalid and fail to compile. continue @@ -531,6 +553,9 @@ def main(args: argparse.Namespace): intermediate_size = config.moe_intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size else: + if not hasattr(config, "hidden_size"): + # Support for llama4 + config = config.text_config # Default: Mixtral. E = config.num_local_experts topk = config.num_experts_per_tok @@ -550,6 +575,8 @@ def main(args: argparse.Namespace): else: batch_sizes = [args.batch_size] + use_deep_gemm = bool(args.use_deep_gemm) + ray.init() num_gpus = int(ray.available_resources()["GPU"]) workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] @@ -572,10 +599,10 @@ def main(args: argparse.Namespace): start = time.time() configs = _distribute( - "tune", - [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, - use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape) - for batch_size in batch_sizes]) + "tune", [(batch_size, E, shard_intermediate_size, hidden_size, + topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space, + block_quant_shape, use_deep_gemm) + for batch_size in batch_sizes]) best_configs = { M: sort_config(config) for M, config in zip(batch_sizes, configs) @@ -589,7 +616,7 @@ def main(args: argparse.Namespace): outputs = _distribute( "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, - use_fp8_w8a8, use_int8_w8a16, block_quant_shape) + use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm) for batch_size in batch_sizes]) for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): @@ -611,6 +638,7 @@ if __name__ == "__main__": type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto") + parser.add_argument("--use-deep-gemm", 
action="store_true") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, required=False) parser.add_argument("--tune", action="store_true") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 48b351bc48141..2625239b08ef2 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -7,10 +7,13 @@ from typing import Optional import torch from vllm import _custom_ops as ops +from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, create_kv_caches_with_random) +logger = init_logger(__name__) + NUM_BLOCKS = 128 * 1024 PARTITION_SIZE = 512 PARTITION_SIZE_ROCM = 256 @@ -193,6 +196,9 @@ def main( if __name__ == '__main__': + logger.warning("This script benchmarks the paged attention kernel. " + "By default this is no longer used in vLLM inference.") + parser = FlexibleArgumentParser( description="Benchmark the paged attention kernel.") parser.add_argument("--version", diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index c375e61e41873..70190ba24d9df 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -75,3 +75,19 @@ WEIGHT_SHAPES = { [7168, 8192], ], } + +WEIGHT_SHAPES_MOE = { + "nm-testing/Mixtral-8x7B-Instruct-v0.1": [ + [8, 2, 4096, 28672], + [8, 2, 14336, 4096], + ], + "nm-testing/deepseekv2-lite": [ + [64, 6, 2048, 1408], + ], + "ibm-granite/granite-3.0-1b-a400m": [ + [32, 8, 1024, 1024], + ], + "ibm-granite/granite-3.0-3b-a800m": [ + [40, 8, 1024, 1536], + ], +} diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh deleted file mode 100755 index ba7383d88dc49..0000000000000 --- a/benchmarks/launch_tgi_server.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -PORT=8000 -MODEL=$1 -TOKENS=$2 - 
-docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ - -v "$PWD/data:/data" \ - ghcr.io/huggingface/text-generation-inference:2.2.0 \ - --model-id "$MODEL" \ - --sharded false \ - --max-input-length 1024 \ - --max-total-tokens 2048 \ - --max-best-of 5 \ - --max-concurrent-requests 5000 \ - --max-batch-total-tokens "$TOKENS" diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index b57d9e2263109..00670bd398b5d 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -33,8 +33,6 @@ endif() if(MACOSX_FOUND) list(APPEND CXX_COMPILE_FLAGS - "-Xpreprocessor" - "-fopenmp" "-DVLLM_CPU_EXTENSION") else() list(APPEND CXX_COMPILE_FLAGS @@ -197,6 +195,7 @@ set(VLLM_EXT_SRC if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC "csrc/cpu/quant.cpp" + "csrc/cpu/shm.cpp" ${VLLM_EXT_SRC}) endif() diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index afd7c47e8ac00..110ef266c6653 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22 + GIT_TAG 0a721daebe4fa7149f06ecf3d3eabeb6dcd0f1fa GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu new file mode 100644 index 0000000000000..14e5edd7e283d --- /dev/null +++ b/csrc/attention/merge_attn_states.cu @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include + +#include "attention_dtypes.h" +#include "attention_utils.cuh" + +namespace vllm { + +// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 +// can be used to combine partial attention results (in the split-KV case) +template +__global__ void 
merge_attn_states_kernel( + scalar_t* output, float* output_lse, const scalar_t* prefix_output, + const float* prefix_lse, const scalar_t* suffix_output, + const float* suffix_lse, const uint num_tokens, const uint num_heads, + const uint head_size) { + using pack_128b_t = uint4; + const uint pack_size = 16 / sizeof(scalar_t); + const uint threads_per_head = head_size / pack_size; + + const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x; + const uint token_head_threads = num_tokens * num_heads * threads_per_head; + + if (global_idx >= token_head_threads) return; + + // global_idx -> token_idx + head_idx + pack_idx + const uint token_head_idx = global_idx / threads_per_head; + const uint pack_idx = global_idx % threads_per_head; + + const uint token_idx = token_head_idx / num_heads; + const uint head_idx = token_head_idx % num_heads; + + const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc. + const uint head_offset = + token_idx * num_heads * head_size + head_idx * head_size; + const scalar_t* prefix_head_ptr = prefix_output + head_offset; + const scalar_t* suffix_head_ptr = suffix_output + head_offset; + scalar_t* output_head_ptr = output + head_offset; + + float p_lse = prefix_lse[head_idx * num_tokens + token_idx]; + float s_lse = suffix_lse[head_idx * num_tokens + token_idx]; + p_lse = std::isinf(p_lse) ? -std::numeric_limits::infinity() : p_lse; + s_lse = std::isinf(s_lse) ? 
-std::numeric_limits::infinity() : s_lse; + + const float max_lse = fmaxf(p_lse, s_lse); + p_lse = p_lse - max_lse; + s_lse = s_lse - max_lse; + const float p_se = expf(p_lse); + const float s_se = expf(s_lse); + const float out_se = p_se + s_se; + const float p_scale = p_se / out_se; + const float s_scale = s_se / out_se; + + if (pack_offset < head_size) { + // Pack 128b load + pack_128b_t p_out_pack = reinterpret_cast( + prefix_head_ptr)[pack_offset / pack_size]; + pack_128b_t s_out_pack = reinterpret_cast( + suffix_head_ptr)[pack_offset / pack_size]; + pack_128b_t o_out_pack; + +#pragma unroll + for (uint i = 0; i < pack_size; ++i) { + // Always use float for FMA to keep high precision. + // half(uint16_t), bfloat16, float -> float. + const float p_out_f = + vllm::to_float(reinterpret_cast(&p_out_pack)[i]); + const float s_out_f = + vllm::to_float(reinterpret_cast(&s_out_pack)[i]); + // fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale) + const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale); + // float -> half(uint16_t), bfloat16, float. + vllm::from_float(reinterpret_cast(&o_out_pack)[i], o_out_f); + } + + // Pack 128b storage + reinterpret_cast(output_head_ptr)[pack_offset / pack_size] = + o_out_pack; + } + // We only need to write to output_lse once per head. + if (output_lse != nullptr && pack_idx == 0) { + float out_lse = logf(out_se) + max_lse; + output_lse[head_idx * num_tokens + token_idx] = out_lse; + } +} + +} // namespace vllm + +// The following macro is used to dispatch the conversion function based on +// the output data type. The FN is a macro that calls a function with +// template. 
+#define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn) \ + { \ + if (scalar_dtype == at::ScalarType::Float) { \ + fn(float); \ + } else if (scalar_dtype == at::ScalarType::Half) { \ + fn(uint16_t); \ + } else if (scalar_dtype == at::ScalarType::BFloat16) { \ + fn(__nv_bfloat16); \ + } else { \ + TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \ + } \ + } + +#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS) \ + { \ + vllm::merge_attn_states_kernel \ + <<>>( \ + reinterpret_cast(output.data_ptr()), output_lse_ptr, \ + reinterpret_cast(prefix_output.data_ptr()), \ + reinterpret_cast(prefix_lse.data_ptr()), \ + reinterpret_cast(suffix_output.data_ptr()), \ + reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \ + num_heads, head_size); \ + } + +/*@brief Merges the attention states from prefix and suffix + * into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d + * + * @param output [n,h,d] The output tensor to store the merged attention states. + * @param output_lse [h,d] Optional tensor to store the log-sum-exp values. + * @param prefix_output [n,h,d] The prefix attention states. + * @param prefix_lse [h,n] The log-sum-exp values for the prefix attention + * states. + * @param suffix_output [n,h,d] The suffix attention states. + * @param suffix_lse [h,n] The log-sum-exp values for the suffix attention + * states. 
+ */ +template +void merge_attn_states_launcher(torch::Tensor& output, + std::optional output_lse, + const torch::Tensor& prefix_output, + const torch::Tensor& prefix_lse, + const torch::Tensor& suffix_output, + const torch::Tensor& suffix_lse) { + constexpr uint NUM_THREADS = 128; + const uint num_tokens = output.size(0); + const uint num_heads = output.size(1); + const uint head_size = output.size(2); + const uint pack_size = 16 / sizeof(scalar_t); + TORCH_CHECK(head_size % pack_size == 0, + "headsize must be multiple of pack_size:", pack_size); + float* output_lse_ptr = nullptr; + if (output_lse.has_value()) { + output_lse_ptr = output_lse.value().data_ptr(); + } + // Process one pack elements per thread. for float, the + // pack_size is 4 for half/bf16, the pack_size is 8. + const uint threads_per_head = head_size / pack_size; + const uint total_threads = num_tokens * num_heads * threads_per_head; + + dim3 block(NUM_THREADS); + dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS); + + const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device()); + auto stream = at::cuda::getCurrentCUDAStream(); + + LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS); +} + +#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \ + { \ + merge_attn_states_launcher(output, output_lse, prefix_output, \ + prefix_lse, suffix_output, \ + suffix_lse); \ + } + +void merge_attn_states(torch::Tensor& output, + std::optional output_lse, + const torch::Tensor& prefix_output, + const torch::Tensor& prefix_lse, + const torch::Tensor& suffix_output, + const torch::Tensor& suffix_lse) { + DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER); +} diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 4568699b30773..cf67847b45ba0 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -78,9 +78,14 @@ struct FP16Vec16 : public Vec { __m256i reg; + // normal load explicit FP16Vec16(const void* ptr) : 
reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} + // non-temporal load + explicit FP16Vec16(bool, void* ptr) + : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + explicit FP16Vec16(const FP32Vec16&); void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } @@ -110,9 +115,14 @@ struct BF16Vec16 : public Vec { __m256i reg; + // normal load explicit BF16Vec16(const void* ptr) : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} + // non-temporal load + explicit BF16Vec16(bool, void* ptr) + : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + explicit BF16Vec16(const FP32Vec16&); void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } @@ -313,8 +323,13 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} + // normal load explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} + // non-temporal load + explicit FP32Vec16(bool, void* ptr) + : reg((__m512)_mm512_stream_load_si512(ptr)) {} + explicit FP32Vec16(__m512 data) : reg(data) {} explicit FP32Vec16(const FP32Vec4& data) @@ -547,6 +562,33 @@ struct INT8Vec16 : public Vec { _mm_mask_storeu_epi8(ptr, mask, reg); } }; + +struct INT8Vec64 : public Vec { + constexpr static int VEC_ELEM_NUM = 64; + union AliasReg { + __m512i reg; + int8_t values[VEC_ELEM_NUM]; + }; + + __m512i reg; + + // normal load + explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {} + + // non-temporal load + explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {} + + void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); } + + void save(int8_t* ptr, const int elem_num) const { + constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF; + __mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num)); + _mm512_mask_storeu_epi8(ptr, mask, reg); + } + + // non-temporal save + void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); } +}; #endif template @@ -657,6 +699,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { inline void prefetch(const void*
addr) { _mm_prefetch(addr, _MM_HINT_T1); } +#ifdef __AVX512F__ +inline void non_temporal_save(FP16Vec16& vec, void* ptr) { + _mm256_stream_si256((__m256i*)ptr, vec.reg); +} +inline void non_temporal_save(BF16Vec32& vec, void* ptr) { + _mm512_stream_si512((__m512i*)ptr, vec.reg); +} +inline void non_temporal_save(BF16Vec16& vec, void* ptr) { + _mm256_stream_si256((__m256i*)ptr, vec.reg); +} +inline void non_temporal_save(FP32Vec16& vec, void* ptr) { + _mm512_stream_ps((float*)ptr, vec.reg); +} +#endif + +inline void mem_barrier() { _mm_mfence(); } }; // namespace vec_op #endif diff --git a/csrc/cpu/shm.cpp b/csrc/cpu/shm.cpp new file mode 100644 index 0000000000000..f55e96de251d0 --- /dev/null +++ b/csrc/cpu/shm.cpp @@ -0,0 +1,781 @@ +#include "cpu/cpu_types.hpp" + +#include +#include +#include +#include + +namespace { +#define MAX_SHM_RANK_NUM 8 +#define MAX_THREAD_NUM 12 +#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024) +#define MIN_THREAD_PROCESS_SIZE (8 * 1024) +#define MAX_P2P_SEND_TENSOR_NUM 8 + +template +struct KernelVecType { + using scalar_vec_t = void; +}; + +template <> +struct KernelVecType { + using scalar_vec_t = vec_op::FP32Vec16; +}; + +template <> +struct KernelVecType { + using scalar_vec_t = vec_op::BF16Vec16; +}; + +template <> +struct KernelVecType { + using scalar_vec_t = vec_op::FP16Vec16; +}; + +enum class ThreadSHMStat : char { THREAD_READY = 0, SHM_DATA_READY, DONE }; + +struct ThreadSHMContext { + volatile ThreadSHMStat thread_stats[MAX_SHM_RANK_NUM]; + int thread_id; + int thread_num; + int rank; + int group_size; + size_t _spinning_count; + int swizzled_ranks[MAX_SHM_RANK_NUM]; + void* thread_shm_ptrs[MAX_SHM_RANK_NUM]; + ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM]; + + ThreadSHMContext(const int thread_id, const int thread_num, const int rank, + const int group_size, void* thread_shm_ptr) + : thread_id(thread_id), + thread_num(thread_num), + rank(rank), + group_size(group_size), + _spinning_count(0) { + 
static_assert(sizeof(ThreadSHMContext) % 64 == 0); + TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM); + TORCH_CHECK((size_t)this % 64 == 0); + TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0); + for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) { + shm_contexts[i] = nullptr; + thread_shm_ptrs[i] = nullptr; + swizzled_ranks[i] = (i + rank) % group_size; + thread_stats[i] = ThreadSHMStat::DONE; + } + set_context(rank, this, thread_shm_ptr); + } + + void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) { + TORCH_CHECK(rank < MAX_SHM_RANK_NUM); + TORCH_CHECK(ptr); + TORCH_CHECK(thread_shm_ptr); + TORCH_CHECK_EQ(ptr->thread_num, thread_num); + TORCH_CHECK_EQ(ptr->thread_id, thread_id); + shm_contexts[rank] = ptr; + thread_shm_ptrs[rank] = thread_shm_ptr; + } + + template + T* get_thread_shm_ptr(int rank) { + return reinterpret_cast(thread_shm_ptrs[rank]); + } + + int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; } + + void wait_for_all(ThreadSHMStat prev_stat) { + for (int idx = 0; idx < group_size; ++idx) { + int rank = get_swizzled_rank(idx); + while (thread_stats[rank] == prev_stat) { + ++_spinning_count; + _mm_pause(); + } + } + vec_op::mem_barrier(); + } + + void wait_for_one(int rank, ThreadSHMStat prev_stat) { + while (thread_stats[rank] == prev_stat) { + ++_spinning_count; + _mm_pause(); + } + vec_op::mem_barrier(); + } + + void set_thread_stat(ThreadSHMStat stat) { + for (int idx = 0; idx < group_size; ++idx) { + int rank = get_swizzled_rank(idx); + shm_contexts[rank]->thread_stats[this->rank] = stat; + } + } + + void set_thread_stat(int target_rank, ThreadSHMStat stat) { + for (int idx = 0; idx < group_size; ++idx) { + int rank = get_swizzled_rank(idx); + shm_contexts[rank]->thread_stats[target_rank] = stat; + } + } + + // barrier for all ranks in the group, used for all2all ops + // DONE -> THREAD_READY -> SHM_DATA_READY -> DONE -> ... 
+ void barrier(ThreadSHMStat next_stat) { + if (next_stat == ThreadSHMStat::THREAD_READY) { + set_thread_stat(ThreadSHMStat::THREAD_READY); + wait_for_all(ThreadSHMStat::DONE); + } else if (next_stat == ThreadSHMStat::SHM_DATA_READY) { + set_thread_stat(ThreadSHMStat::SHM_DATA_READY); + wait_for_all(ThreadSHMStat::THREAD_READY); + } else if (next_stat == ThreadSHMStat::DONE) { + set_thread_stat(ThreadSHMStat::DONE); + wait_for_all(ThreadSHMStat::SHM_DATA_READY); + } else { + TORCH_CHECK(false, "Invalid next_stat to barrier."); + } + } + + std::string to_string() const { + std::stringstream ss; + ss << "SHMContext:"; + ss << "\nrank: " << rank; + ss << "\ngroup_size: " << group_size; + ss << "\nthread_num: " << thread_num; + ss << "\nthread_id: " << thread_id; + + ss << "\nshm_ctx_stat_loop_seq: ["; + for (int i = 0; i < group_size; ++i) { + ss << swizzled_ranks[i] << ", "; + } + ss << "]"; + + ss << "\nshm_contexts: ["; + for (int i = 0; i < group_size; ++i) { + if (shm_contexts[i]) { + ss << shm_contexts[i]->rank << ", "; + } + } + ss << "]"; + + return ss.str(); + } +}; + +class SHMManager { + public: + explicit SHMManager(const std::string& name, const int rank, + const int group_size) + : _rank(rank), + _group_size(group_size), + _thread_num(std::min(torch::get_num_threads(), MAX_THREAD_NUM)), + _shm_names({""}), + _shared_mem_ptrs({nullptr}), + _shm_ctx(nullptr) { + _shm_names[rank] = get_shm_name(name, rank); + _shared_mem_ptrs[rank] = init_shm(rank); + _shm_ctx = reinterpret_cast(_shared_mem_ptrs[rank]); + + for (int i = 0; i < _thread_num; ++i) { + ThreadSHMContext* ctx = new (_shm_ctx + i) + ThreadSHMContext(i, _thread_num, _rank, _group_size, + compute_thread_shm_ptr(_shm_ctx, i)); + } + } + + void join(const std::string& name) { + for (int rank_idx = 0; rank_idx < _group_size; ++rank_idx) { + if (rank_idx != _rank) { + TORCH_CHECK(_shm_names[rank_idx].empty()); + TORCH_CHECK(_shared_mem_ptrs[rank_idx] == nullptr); + _shm_names[rank_idx] = 
get_shm_name(name, rank_idx); + _shared_mem_ptrs[rank_idx] = init_shm(rank_idx); + ThreadSHMContext* target_ctx = + reinterpret_cast(_shared_mem_ptrs[rank_idx]); + for (int thread_idx = 0; thread_idx < _thread_num; ++thread_idx) { + _shm_ctx[thread_idx].set_context( + rank_idx, target_ctx + thread_idx, + compute_thread_shm_ptr(target_ctx, thread_idx)); + } + } + } + } + + ~SHMManager() { destroy_shm(); } + + ThreadSHMContext* get_shm_ctx() const { return _shm_ctx; } + + static std::string get_shm_name(const std::string& name, int rank) { + return name + "_" + std::to_string(rank); + } + + static int64_t create_singleton_instance(const std::string& name, + const int group_size, + const int rank) { + std::lock_guard guard(SingletonInstancesLock); + SingletonInstances.emplace_back( + std::make_unique(name, rank, group_size)); + return static_cast(SingletonInstances.size() - 1); + } + + static SHMManager* get_singleton_instance(int64_t handle) { + return SingletonInstances[handle].get(); + } + + protected: + static std::vector> SingletonInstances; + static std::mutex SingletonInstancesLock; + + private: + static size_t round_to_alignment(size_t num) { + return ((num + 63) / 64) * 64; + } + + int8_t* compute_thread_shm_ptr(ThreadSHMContext* ctx, int thread_id) { + int8_t* thread_shm_ptr = + reinterpret_cast(ctx) + + round_to_alignment(_thread_num * sizeof(ThreadSHMContext)); + return thread_shm_ptr + + thread_id * round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES); + } + + size_t compute_shm_size() { + const size_t rounded_rank_buffer_size = + round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES) * _thread_num; + const size_t rounded_thread_shm_ctx_size = + round_to_alignment(_thread_num * sizeof(ThreadSHMContext)); + const size_t shm_size = + rounded_thread_shm_ctx_size + rounded_rank_buffer_size; + return shm_size; + } + + void* init_shm(int target_rank) { + const std::string& shm_name = _shm_names[target_rank]; + const int local_rank = _rank; + const size_t shm_size = 
compute_shm_size(); + + int fd = -1; + if (local_rank == target_rank) { + fd = shm_open(shm_name.c_str(), O_CREAT | O_EXCL | O_RDWR, + S_IRUSR | S_IWUSR); + + if (fd == -1) + TORCH_CHECK(false, "create shm in SHMManager failed. errno: " + + std::to_string(errno)); + + if (ftruncate(fd, shm_size) == -1) + TORCH_CHECK(false, "ftruncate in SHMManager failed. errno: " + + std::to_string(errno)); + } else { + fd = shm_open(shm_name.c_str(), O_RDWR, S_IRUSR | S_IWUSR); + + if (fd == -1) + TORCH_CHECK(false, "open shm in SHMManager failed. errno: " + + std::to_string(errno)); + } + + void* shm_ptr = mmap(nullptr, shm_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (shm_ptr == MAP_FAILED) { + TORCH_CHECK(false, + "mmap in SHMManager failed. errno: " + std::to_string(errno)); + } + + if (close(fd) != 0) { + TORCH_CHECK( + false, "close in SHMManager failed. errno: " + std::to_string(errno)); + } + + TORCH_CHECK((size_t)shm_ptr % 64 == 0); + + return shm_ptr; + } + + void destroy_shm() { + std::stringstream ss; + ss << "local rank " << _rank << ": ["; + for (int thread_id = 0; thread_id < _thread_num; ++thread_id) { + ss << _shm_ctx[thread_id]._spinning_count << ", "; + } + ss << "]\n"; + + for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) { + if (_shared_mem_ptrs[i] != nullptr) { + munmap(_shared_mem_ptrs[i], compute_shm_size()); + } + + if (!_shm_names[i].empty()) { + shm_unlink(_shm_names[i].c_str()); + } + } + } + + int _rank; + int _group_size; + int _thread_num; + std::array _shm_names; + std::array _shared_mem_ptrs; + ThreadSHMContext* _shm_ctx; +}; + +namespace shm_cc_ops { +template +void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) { + int thread_num = ctx->thread_num; + int64_t total_bytes = elem_num * sizeof(scalar_t); + int64_t total_units_num = + (total_bytes + MIN_THREAD_PROCESS_SIZE - 1) / MIN_THREAD_PROCESS_SIZE; + int64_t per_thread_units_num = + (total_units_num + thread_num - 1) / thread_num; + int64_t 
per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t); + int64_t max_per_thread_iteration_elem_num = + PER_THREAD_SHM_BUFFER_BYTES / sizeof(scalar_t); + int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num; + +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < thread_num; ++i) { + int64_t offset = i * per_thread_elem_num; + int64_t end = std::min(elem_num, offset + per_thread_elem_num); + int64_t curr_elem_num = + std::min(max_per_thread_iteration_elem_num, end - offset); + ThreadSHMContext* thread_ctx = ctx + i; + + while (curr_elem_num > 0) { + inner_func(thread_ctx, offset, curr_elem_num); + + offset += max_per_thread_iteration_elem_num; + curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset); + } + } +} +}; // namespace shm_cc_ops + +namespace shm_cc_ops { + +void memcpy_from_shm(void* dst, void* src, const int64_t bytes) { + const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned + int64_t i = 0; +#pragma GCC unroll 4 + for (; i < aligned_bytes; i += 64) { + vec_op::INT8Vec64 data( + true, (int8_t*)src + i); // stream loading shm to avoid caching + data.save((int8_t*)dst + i); + } + if (aligned_bytes < bytes) { + vec_op::INT8Vec64 data(true, (int8_t*)src + aligned_bytes); + data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes); + } +} + +void memcpy_to_shm(void* dst, void* src, const int64_t bytes) { +#pragma GCC unroll 4 + for (int64_t i = 0; i < bytes; i += 64) { + vec_op::INT8Vec64 data((int8_t*)src + i); + data.nt_save((int8_t*)dst + i); + } +} + +void memcpy(void* dst, void* src, const int64_t bytes) { + const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned + int64_t i = 0; +#pragma GCC unroll 4 + for (; i < aligned_bytes; i += 64) { + vec_op::INT8Vec64 data((int8_t*)src + i); + data.save((int8_t*)dst + i); + } + if (aligned_bytes < bytes) { + vec_op::INT8Vec64 data((int8_t*)src + aligned_bytes); + data.save((int8_t*)dst + aligned_bytes, bytes - 
aligned_bytes); + } +} + +template +void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data, + size_t elem_num) { + CPU_KERNEL_GUARD_IN(all_reduce_sum_impl) + using vec_t = typename KernelVecType::scalar_vec_t; + constexpr int64_t vec_elem_num = vec_t::get_elem_num(); + const int worldsize = ctx->group_size; + + shm_cc_ops::shm_cc_loop( + ctx, elem_num, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num) { + int rank = thread_ctx->rank; + scalar_t* thread_shm_ptr = + thread_ctx->get_thread_shm_ptr(rank); + scalar_t* thread_data_ptr = data + data_offset; + int64_t thread_data_elem_num = data_elem_num * sizeof(scalar_t); + + scalar_t* remote_data_ptrs[RANKS - 1]; + vec_op::unroll_loop([&](int idx) { + remote_data_ptrs[idx] = thread_ctx->get_thread_shm_ptr( + thread_ctx->get_swizzled_rank(idx + 1)); + }); + + thread_ctx->barrier(ThreadSHMStat::THREAD_READY); + + shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr, + thread_data_elem_num); + + thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY); + + int64_t aligned_data_elem_num = + (data_elem_num / vec_elem_num) * vec_elem_num; + int64_t i = 0; +#pragma GCC unroll 4 + for (; i < aligned_data_elem_num; i += vec_elem_num) { + vec_t local_data(thread_data_ptr + i); // load from cache + vec_op::FP32Vec16 local_data_fp32(local_data); + vec_op::unroll_loop([&](int idx) { + vec_t remote_data( + true, remote_data_ptrs[idx] + i); // stream load from shm + vec_op::FP32Vec16 remote_data_fp32(remote_data); + local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce + }); + vec_t reduced_data(local_data_fp32); + reduced_data.save(thread_data_ptr + i); + } + + if (i < data_elem_num) { + vec_t local_data(thread_data_ptr + i); // load from cache + vec_op::FP32Vec16 local_data_fp32(local_data); + vec_op::unroll_loop([&](int idx) { + vec_t remote_data( + true, remote_data_ptrs[idx] + i); // stream load from shm + vec_op::FP32Vec16 remote_data_fp32(remote_data); + local_data_fp32 = 
local_data_fp32 + remote_data_fp32; // sum reduce + }); + vec_t reduced_data(local_data_fp32); + reduced_data.save(thread_data_ptr + i, + data_elem_num - aligned_data_elem_num); + } + + thread_ctx->barrier(ThreadSHMStat::DONE); + }); + + return; +} +}; // namespace shm_cc_ops + +std::vector> SHMManager::SingletonInstances = {}; +std::mutex SHMManager::SingletonInstancesLock = {}; + +template +void shm_allreduce_sum(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num) { + switch (ctx->group_size) { + case 2: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + case 3: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + case 4: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + case 8: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + default: + TORCH_CHECK(false, + "Invalid world size: " + std::to_string(ctx->group_size)); + } +} + +template +void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num, + scalar_t** outputs, const int dst) { + CPU_KERNEL_GUARD_IN(shm_gather_impl) + const int worldsize = ctx->group_size; + TORCH_CHECK_LT(dst, worldsize); + shm_cc_ops::shm_cc_loop( + ctx, elem_num, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num) { + int rank = thread_ctx->rank; + scalar_t* thread_shm_ptr = + thread_ctx->get_thread_shm_ptr(rank); + + thread_ctx->barrier(ThreadSHMStat::THREAD_READY); + + shm_cc_ops::memcpy_to_shm(thread_shm_ptr, data + data_offset, + data_elem_num * sizeof(scalar_t)); + + thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY); + + if (rank == dst) { + shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset, + data_elem_num * sizeof(scalar_t)); + for (int i = 1; i < worldsize; ++i) { + int src_rank = thread_ctx->get_swizzled_rank(i); + scalar_t* src_ptr = + thread_ctx->get_thread_shm_ptr(src_rank); // shm + scalar_t* dst_ptr = outputs[src_rank] + data_offset; + shm_cc_ops::memcpy_from_shm(dst_ptr, src_ptr, + 
data_elem_num * sizeof(scalar_t)); + } + } + + thread_ctx->barrier(ThreadSHMStat::DONE); + }); + + return; +} + +struct MemPiece { + void* ptr; + int64_t size; + + template + T* data_ptr() { + return reinterpret_cast(ptr); + } +}; + +struct TensorListMeta { + int64_t tensor_bytes[MAX_P2P_SEND_TENSOR_NUM]; + torch::ScalarType tensor_types[MAX_P2P_SEND_TENSOR_NUM]; + int64_t tensor_num; + int64_t total_bytes; + + TensorListMeta() : tensor_num(0), total_bytes(0) { + static_assert(sizeof(TensorListMeta) % 64 == 0); + static_assert(sizeof(TensorListMeta) < + MIN_THREAD_PROCESS_SIZE); // To ensure the metadata always + // hold by the thread 0 + for (int i = 0; i < MAX_P2P_SEND_TENSOR_NUM; ++i) { + tensor_bytes[i] = 0; + tensor_ptrs[i] = nullptr; + tensor_types[i] = torch::ScalarType::Undefined; + } + } + + // For send and recv + void bind_tensor_list(std::vector& tensor_list) { + TORCH_CHECK(tensor_types[0] == torch::ScalarType::Undefined, + "Re-bind TensorListMeta is not allowed.") + TORCH_CHECK_LE(tensor_list.size(), MAX_P2P_SEND_TENSOR_NUM); + tensor_num = tensor_list.size(); + int64_t bytes_sum = 0; + for (int i = 0; i < tensor_list.size(); ++i) { + torch::Tensor& t = tensor_list[i]; + TORCH_CHECK(t.is_contiguous()); + tensor_bytes[i] = t.nbytes(); + tensor_types[i] = t.scalar_type(); + tensor_ptrs[i] = t.data_ptr(); + bytes_sum += t.nbytes(); + } + total_bytes = bytes_sum; + } + + // For recv + std::vector generate_tensor_list() { + std::vector tensor_list; + tensor_list.reserve(tensor_num); + + for (int i = 0; i < tensor_num; ++i) { + int64_t bytes = tensor_bytes[i]; + auto type = tensor_types[i]; + int64_t elem_bytes = torch::elementSize(type); + + TORCH_CHECK_EQ(bytes % elem_bytes, 0); + int64_t elem_num = bytes / elem_bytes; + auto options = torch::TensorOptions().dtype(type).device(torch::kCPU); + tensor_list.emplace_back(torch::empty({elem_num}, options)); + } + return tensor_list; + } + + MemPiece get_data(int64_t offset) { + for (int i = 0; i < tensor_num; 
++i) { + if (offset < tensor_bytes[i]) { + return {reinterpret_cast(tensor_ptrs[i]) + offset, + tensor_bytes[i] - offset}; + } + offset -= tensor_bytes[i]; + } + return {nullptr, 0}; + } + + private: + void* tensor_ptrs[MAX_P2P_SEND_TENSOR_NUM]; + int8_t _padding[40]; +}; + +void shm_send_tensor_list_impl(ThreadSHMContext* ctx, + const std::vector& tensor_list) { + CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl) + std::vector tensor_list_with_metadata; + tensor_list_with_metadata.reserve(1 + tensor_list.size()); + + auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU); + tensor_list_with_metadata.emplace_back( + torch::empty({sizeof(TensorListMeta)}, options)); + tensor_list_with_metadata.insert(tensor_list_with_metadata.end(), + tensor_list.begin(), tensor_list.end()); + + torch::Tensor& metadata_tensor = tensor_list_with_metadata[0]; + TORCH_CHECK_EQ(metadata_tensor.nbytes(), sizeof(TensorListMeta)); + + TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta(); + metadata->bind_tensor_list(tensor_list_with_metadata); + + shm_cc_ops::shm_cc_loop( + ctx, metadata->total_bytes, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num) { + int rank = thread_ctx->rank; + // Wait until the receiver set the stat to DONE + thread_ctx->wait_for_one(rank, ThreadSHMStat::SHM_DATA_READY); + + int64_t curr_shm_offset = 0; + while (curr_shm_offset < data_elem_num) { + MemPiece frag = metadata->get_data(data_offset + curr_shm_offset); + frag.size = std::min(frag.size, data_elem_num - curr_shm_offset); + shm_cc_ops::memcpy( + thread_ctx->get_thread_shm_ptr(rank) + curr_shm_offset, + frag.ptr, frag.size); + curr_shm_offset += frag.size; + } + + thread_ctx->set_thread_stat(rank, ThreadSHMStat::SHM_DATA_READY); + }); +} + +std::vector shm_recv_tensor_list_impl(ThreadSHMContext* ctx, + int64_t src) { + CPU_KERNEL_GUARD_IN(shm_recv_tensor_list_impl) + auto options = 
torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU); + torch::Tensor metadata_tensor = + torch::empty({sizeof(TensorListMeta)}, options); + + // Wait until the sender set the stat of the thread 0 to SHM_DATA_READY + ctx->wait_for_one(src, ThreadSHMStat::DONE); + shm_cc_ops::memcpy(metadata_tensor.data_ptr(), + ctx->get_thread_shm_ptr(src), + sizeof(TensorListMeta)); + TensorListMeta* src_metadata = + reinterpret_cast(metadata_tensor.data_ptr()); + std::vector tensor_list_with_metadata = + src_metadata->generate_tensor_list(); + + TensorListMeta metadata; + metadata.bind_tensor_list(tensor_list_with_metadata); + TORCH_CHECK_EQ(metadata.tensor_num, src_metadata->tensor_num); + TORCH_CHECK_EQ(metadata.total_bytes, src_metadata->total_bytes); + + shm_cc_ops::shm_cc_loop( + ctx, metadata.total_bytes, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num) { + // Wait until the sender set the stat to SHM_DATA_READY + thread_ctx->wait_for_one(src, ThreadSHMStat::DONE); + int64_t curr_shm_offset = 0; + while (curr_shm_offset < data_elem_num) { + MemPiece frag = metadata.get_data(data_offset + curr_shm_offset); + frag.size = std::min(frag.size, data_elem_num - curr_shm_offset); + shm_cc_ops::memcpy( + frag.ptr, + thread_ctx->get_thread_shm_ptr(src) + curr_shm_offset, + frag.size); + curr_shm_offset += frag.size; + } + + thread_ctx->set_thread_stat(src, ThreadSHMStat::DONE); + }); + + std::vector tensor_list; + tensor_list.reserve(metadata.tensor_num - 1); + tensor_list.insert(tensor_list.begin(), tensor_list_with_metadata.begin() + 1, + tensor_list_with_metadata.end()); + + return tensor_list; +} +} // namespace + +void shm_gather(int64_t handle, torch::Tensor& data, + const std::optional>& outputs, + int64_t dst) { + TORCH_CHECK(data.is_contiguous()) + VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_gather_impl", [&] { + CPU_KERNEL_GUARD_IN(shm_gather_impl) + + if (outputs.has_value()) { + TORCH_CHECK_LE(outputs->size(), 
MAX_SHM_RANK_NUM); + scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr}; + for (int i = 0; i < outputs->size(); ++i) { + output_ptrs[i] = outputs->at(i).data_ptr(); + } + shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(), + data.data_ptr(), data.numel(), output_ptrs, + dst); + } else { + shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(), + data.data_ptr(), data.numel(), (scalar_t**)(0), + dst); + } + + CPU_KERNEL_GUARD_OUT(shm_gather_impl) + }); +} + +void shm_all_gather(int64_t handle, const torch::Tensor& data, + torch::Tensor& output) { + TORCH_CHECK(data.is_contiguous()) + TORCH_CHECK(output.is_contiguous()) + + const int64_t input_elem_num = data.numel(); + const int64_t output_elem_num = output.numel(); + TORCH_CHECK_EQ(output_elem_num % input_elem_num, 0); + const int world_size = output_elem_num / input_elem_num; + + VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_all_gather_impl", [&] { + CPU_KERNEL_GUARD_IN(shm_all_gather_impl) + auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx(); + TORCH_CHECK_EQ(ctx->group_size, world_size); + + scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr}; + for (int i = 0; i < world_size; ++i) { + output_ptrs[i] = output.data_ptr() + i * input_elem_num; + } + shm_gather_impl(ctx, data.data_ptr(), data.numel(), output_ptrs, + ctx->rank); + CPU_KERNEL_GUARD_OUT(shm_all_gather_impl) + }); +} + +void shm_allreduce(int64_t handle, torch::Tensor& data) { + TORCH_CHECK(data.is_contiguous()) + VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_allreduce_sum", [&] { + CPU_KERNEL_GUARD_IN(shm_allreduce_sum) + shm_allreduce_sum(SHMManager::get_singleton_instance(handle)->get_shm_ctx(), + data.data_ptr(), data.numel()); + CPU_KERNEL_GUARD_OUT(shm_allreduce_sum) + }); +} + +void shm_send_tensor_list(int64_t handle, + const std::vector& tensor_list, + int64_t dst) { + CPU_KERNEL_GUARD_IN(shm_send_tensor_list) + shm_send_tensor_list_impl( + 
SHMManager::get_singleton_instance(handle)->get_shm_ctx(), tensor_list); + CPU_KERNEL_GUARD_OUT(shm_send_tensor_list) +} + +std::vector shm_recv_tensor_list(int64_t handle, int64_t src) { + CPU_KERNEL_GUARD_IN(shm_recv_tensor_list) + auto tensor_list = shm_recv_tensor_list_impl( + SHMManager::get_singleton_instance(handle)->get_shm_ctx(), src); + CPU_KERNEL_GUARD_OUT(shm_recv_tensor_list) + return tensor_list; +} + +int64_t init_shm_manager(const std::string& name, const int64_t group_size, + const int64_t rank) { + return SHMManager::create_singleton_instance(name, group_size, rank); +} + +std::string join_shm_manager(int64_t handle, const std::string& name) { + auto shm_manager = SHMManager::get_singleton_instance(handle); + TORCH_CHECK(shm_manager); + shm_manager->join(name); + return shm_manager->get_shm_ctx()->to_string(); +} \ No newline at end of file diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index ef5a2fb5c4d22..7ae7e3386b4ed 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -22,6 +22,26 @@ void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, torch::Tensor& kv_cache, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens); +int64_t init_shm_manager(const std::string& name, const int64_t group_size, + const int64_t rank); + +std::string join_shm_manager(int64_t handle, const std::string& name); + +void shm_allreduce(int64_t handle, torch::Tensor& data); + +void shm_gather(int64_t handle, torch::Tensor& data, + const std::optional>& outputs, + int64_t dst); + +void shm_all_gather(int64_t handle, const torch::Tensor& data, + torch::Tensor& output); + +void shm_send_tensor_list(int64_t handle, + const std::vector& tensor_list, + int64_t dst); + +std::vector shm_recv_tensor_list(int64_t handle, int64_t src); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -131,6 +151,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor? azp, Tensor? 
bias) -> ()"); ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); #endif + +// SHM CCL +#ifdef __AVX512F__ + ops.def("init_shm_manager(str name, int group_size, int rank) -> int", + &init_shm_manager); + ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager); + ops.def("shm_allreduce(int handle, Tensor! data) -> ()"); + ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce); + ops.def( + "shm_gather(int handle, Tensor data, Tensor[](a!)? outputs, int dst) -> " + "()"); + ops.impl("shm_gather", torch::kCPU, &shm_gather); + ops.def( + "shm_all_gather(int handle, Tensor data, Tensor! output) -> " + "()"); + ops.impl("shm_all_gather", torch::kCPU, &shm_all_gather); + ops.def( + "shm_send_tensor_list(int handle, Tensor[](a) tensor_list, int dst) -> " + "()"); + ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list); + ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)", + &shm_recv_tensor_list); +#endif } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index 42a1c1d924bac..c17a8961629a6 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -4,6 +4,11 @@ #include #include #endif +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30 + #include + #include + #define gettid() syscall(SYS_gettid) +#endif #include "cpu_types.hpp" @@ -18,7 +23,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { #ifndef VLLM_NUMA_DISABLED std::string init_cpu_threads_env(const std::string& cpu_ids) { - bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); + bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str()); TORCH_CHECK(omp_cpu_mask->size > 0); std::vector omp_cpu_ids; omp_cpu_ids.reserve(omp_cpu_mask->size); diff --git a/csrc/cuda_view.cu b/csrc/cuda_view.cu new file mode 100644 index 0000000000000..938bd4ab7fc62 --- /dev/null +++ b/csrc/cuda_view.cu @@ -0,0 +1,39 @@ +#include +#include +#include + +// This 
function assumes that `cpu_tensor` is a CPU tensor allocated with pinned +// memory, and that UVA (Unified Virtual Addressing) is enabled. +torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) { + TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU"); + + // Get raw host pointer from CPU tensor + void* host_ptr = cpu_tensor.data_ptr(); + + // Get a device pointer corresponding to the pinned host memory + void* device_ptr = nullptr; + cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0); + TORCH_CHECK(err == cudaSuccess, + "cudaHostGetDevicePointer failed: ", cudaGetErrorString(err)); + + // We'll use the same sizes, strides, and dtype as the CPU tensor. + // TODO: check if layout is respected. + auto sizes = cpu_tensor.sizes(); + auto strides = cpu_tensor.strides(); + auto options = cpu_tensor.options().device(torch::kCUDA); + + // from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter, + // const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the + // memory, so we don't free it here. 
+ auto deleter = [](void*) { + // no-op, since the memory is owned by the original CPU tensor + }; + + torch::Tensor cuda_tensor = + torch::from_blob(device_ptr, sizes, strides, deleter, options); + + TORCH_CHECK(cuda_tensor.device().is_cuda(), + "Resulting tensor is not on CUDA device"); + + return cuda_tensor; +} diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu index 123278bfed71d..a38d6fa24a28e 100644 --- a/csrc/custom_all_reduce.cu +++ b/csrc/custom_all_reduce.cu @@ -12,7 +12,7 @@ static_assert(sizeof(void*) == sizeof(fptr_t)); fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, torch::Tensor& rank_data, int64_t rank, - bool full_nvlink) { + bool fully_connected) { int world_size = fake_ipc_ptrs.size(); if (world_size > 8) throw std::invalid_argument("world size > 8 is not supported"); @@ -27,7 +27,7 @@ fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, } return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(), rank_data.numel(), rank, world_size, - full_nvlink); + fully_connected); } /** @@ -142,3 +142,48 @@ void register_graph_buffers(fptr_t _fa, bytes.reserve(handles.size()); fa->register_graph_buffers(bytes, offsets); } + +std::tuple allocate_shared_buffer_and_handle( + int64_t size) { + auto device_index = c10::cuda::current_device(); + at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index)); + void* buffer; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + auto stream = c10::cuda::getCurrentCUDAStream().stream(); + AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + + // Allocate buffer +#if defined(USE_ROCM) + // data buffers need to be "uncached" for signal on MI200 + AT_CUDA_CHECK( + hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached)); +#else + AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size)); +#endif + AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream)); + AT_CUDA_CHECK(cudaStreamSynchronize(stream)); + 
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + + // Create IPC memhandle for the allocated buffer. + // Will use it in open_mem_handle. + auto options = + torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); + auto handle = + torch::empty({static_cast(sizeof(cudaIpcMemHandle_t))}, options); + AT_CUDA_CHECK( + cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer)); + + return std::make_tuple(reinterpret_cast(buffer), handle); +} + +fptr_t open_mem_handle(torch::Tensor& mem_handle) { + void* ipc_ptr; + AT_CUDA_CHECK(cudaIpcOpenMemHandle( + (void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()), + cudaIpcMemLazyEnablePeerAccess)); + return reinterpret_cast(ipc_ptr); +} + +void free_shared_buffer(fptr_t buffer) { + AT_CUDA_CHECK(cudaFree(reinterpret_cast(buffer))); +} diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index b9df4ed160b03..44709b4597765 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -5,6 +5,10 @@ #include #include +#if defined(USE_ROCM) +typedef __hip_bfloat16 nv_bfloat16; +#endif + #include #include #include @@ -12,6 +16,7 @@ #include #include +namespace vllm { #define CUDACHECK(cmd) \ do { \ cudaError_t e = cmd; \ @@ -22,24 +27,37 @@ } \ } while (0) -namespace vllm { - +// Maximal number of blocks in allreduce kernel. constexpr int kMaxBlocks = 36; + +// Default number of blocks in allreduce kernel. +#ifndef USE_ROCM +const int defaultBlockLimit = 36; +CUpointer_attribute rangeStartAddrAttr = CU_POINTER_ATTRIBUTE_RANGE_START_ADDR; +#else +const int defaultBlockLimit = 16; +hipPointer_attribute rangeStartAddrAttr = + HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR; +#endif + // Counter may overflow, but it's fine since unsigned int overflow is // well-defined behavior. using FlagType = uint32_t; + +// Two sets of peer counters are needed for two syncs: starting and ending an +// operation. 
The reason is that it's possible for peer GPU block to arrive at +// the second sync point while the current GPU block haven't passed the first +// sync point. Thus, peer GPU may write counter+1 while current GPU is busy +// waiting for counter. We use alternating counter array to avoid this +// possibility. struct Signal { - alignas(128) FlagType self_counter[kMaxBlocks][8]; - // Two sets of peer counters are needed for two syncs. The reason is that - // it's possible for peer GPU block to arrive at the second sync point while - // the current GPU block haven't passed the first sync point. Thus, peer GPU - // may write counter+1 while current GPU is busy waiting for counter. We use - // alternating counter array to avoid this possibility. - alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; + alignas(128) FlagType start[kMaxBlocks][8]; + alignas(128) FlagType end[kMaxBlocks][8]; + alignas(128) FlagType _flag[kMaxBlocks]; // incremental flags for each rank }; struct __align__(16) RankData { - const void* __restrict__ ptrs[8]; + const void* ptrs[8]; }; struct __align__(16) RankSignals { @@ -134,27 +152,29 @@ DINLINE O downcast(array_t val) { } } +#if !defined(USE_ROCM) + static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); -#else + #else asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); -#endif + #endif } static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) { FlagType flag; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 asm volatile("ld.acquire.sys.global.u32 %0, [%1];" : "=r"(flag) : "l"(flag_addr)); -#else + #else asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;" : "=r"(flag) : "l"(flag_addr)); -#endif + #endif return flag; } @@ -170,37 +190,99 
@@ static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) { return flag; } -// is_start: whether this is the very first synchronization barrier. -// need_fence: whether a memory fence is needed. If true, a release-acquire -// semantic is used to enforce memory access order before and after this -// barrier. -template -DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg, - int rank) { - if constexpr (!is_start) __syncthreads(); - static_assert( - !(is_start && need_fence)); // Start barrier shouldn't need fence. +// This function is meant to be used as the first synchronization in the all +// reduce kernel. Thus, it doesn't need to make any visibility guarantees for +// prior memory accesses. Note: volatile writes will not be reordered against +// other volatile writes. +template +DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg, + int rank) { + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; if (threadIdx.x < ngpus) { - // Increment the counter. Technically we only need one counter, but we use - // multiple per block to eliminate the need to share the counter via smem. - auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1; + auto peer_counter_ptr = &sg.signals[threadIdx.x]->start[blockIdx.x][rank]; + auto self_counter_ptr = &self_sg->start[blockIdx.x][threadIdx.x]; + // Write the expected counter value to peer and wait for correct value + // from peer. + st_flag_volatile(peer_counter_ptr, flag); + while (ld_flag_volatile(self_counter_ptr) != flag); + } + __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; +} + +// This function is meant to be used as the second or the final +// synchronization barrier in the all reduce kernel. If it's the final +// synchronization barrier, we don't need to make any visibility guarantees +// for prior memory accesses. 
+template +DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) { + __syncthreads(); + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) { + auto peer_counter_ptr = &sg.signals[threadIdx.x]->end[blockIdx.x][rank]; + auto self_counter_ptr = &self_sg->end[blockIdx.x][threadIdx.x]; // Write the expected counter value to peer and wait for correct value from // peer. - auto peer_counter_ptr = - &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank]; - auto self_counter_ptr = - &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x]; - if constexpr (need_fence) { - st_flag_release(peer_counter_ptr, val); - while (ld_flag_acquire(self_counter_ptr) != val); + if constexpr (!final_sync) { + st_flag_release(peer_counter_ptr, flag); + while (ld_flag_acquire(self_counter_ptr) != flag); } else { - st_flag_volatile(peer_counter_ptr, val); - while (ld_flag_volatile(self_counter_ptr) != val); + st_flag_volatile(peer_counter_ptr, flag); + while (ld_flag_volatile(self_counter_ptr) != flag); } } - if constexpr (is_start || need_fence) __syncthreads(); + if constexpr (!final_sync) __syncthreads(); + + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; } +#else + +template +DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg, + int rank) { + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) { + // simultaneously write to the corresponding flag of all ranks. 
+ // Latency = 1 p2p write + __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank], + flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); + // wait until we got true from all ranks + while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x], + __ATOMIC_RELAXED, + __MEMORY_SCOPE_DEVICE) < flag); + } + __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; +} + +template +DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) { + __syncthreads(); + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) { + // simultaneously write to the corresponding flag of all ranks. + // Latency = 1 p2p write + __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank], + flag, + final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE, + __MEMORY_SCOPE_SYSTEM); + // wait until we got true from all ranks + while ( + __scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x], + final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE, + __MEMORY_SCOPE_DEVICE) < flag); + } + if constexpr (!final_sync) __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; +} + +#endif + template DINLINE P packed_reduce(const P* ptrs[], int idx) { A tmp = upcast(ptrs[0][idx]); @@ -220,13 +302,13 @@ __global__ void __launch_bounds__(512, 1) // note: we don't reorder the address so the accumulation order is the same // for all ranks, ensuring bitwise identical results auto dp = *_dp; - multi_gpu_barrier(sg, self_sg, rank); + barrier_at_start(sg, self_sg, rank); // do the actual reduction for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x) { ((P*)result)[idx] = packed_reduce((const P**)&dp.ptrs[0], idx); } - multi_gpu_barrier(sg, self_sg, rank); + barrier_at_end(sg, self_sg, rank); } template @@ -255,18 +337,20 @@ __global__ void __launch_bounds__(512, 1) tmps[i] = get_tmp_buf

(sg.signals[target]); } auto tmp_out = tmps[0]; - multi_gpu_barrier(sg, self_sg, rank); + barrier_at_start(sg, self_sg, rank); + // stage 1: reduce scatter for (int idx = start + tid; idx < end; idx += stride) { tmp_out[idx - start] = packed_reduce(ptrs, idx); } - multi_gpu_barrier(sg, self_sg, rank); + barrier_at_end(sg, self_sg, rank); // stage 2: allgather. Note: it's important to match the tid between // the two stages, because visibility across devices is only guaranteed // between threads that have the same tid. If thread i computes the sum of - // start + i in the first stage, then thread i also gathers start + i from all - // ranks. + // start + i in the first stage, then thread i also gathers start + i from + // all ranks. + for (int idx = tid; idx < largest_part; idx += stride) { #pragma unroll for (int i = 0; i < ngpus; i++) { @@ -287,21 +371,22 @@ class CustomAllreduce { public: int rank_; int world_size_; - bool full_nvlink_; + // Full NVLink or xGMI connection between GPUs. + bool fully_connected_; RankSignals sg_; - // Stores an map from a pointer to its peer pointters from all ranks. + // Stores a map from a pointer to its peer pointers from all ranks. std::unordered_map buffers_; Signal* self_sg_; // Stores rank data from all ranks. This is mainly for cuda graph purposes. // For cuda graph to work, all kernel arguments must be fixed during graph - // capture time. However, the peer pointers are not known during graph capture - // time. Therefore, during capture, we increment the rank data pointer and use - // that as the argument to the kernel. The kernel arguments are stored in - // graph_unreg_buffers_. The actual peer pointers will be filled in at the - // memory pointed to by the pointers in graph_unreg_buffers_ when - // the IPC handles are exchanged between ranks. + // capture time. However, the peer pointers are not known during graph + // capture time. 
Therefore, during capture, we increment the rank data + // pointer and use that as the argument to the kernel. The kernel arguments + // are stored in graph_unreg_buffers_. The actual peer pointers will be + // filled in at the memory pointed to by the pointers in + // graph_unreg_buffers_ when the IPC handles are exchanged between ranks. // // The overall process looks like this: // 1. Graph capture. @@ -319,17 +404,18 @@ class CustomAllreduce { * Signals are an array of ipc-enabled buffers from all ranks. * For each of the buffer, the layout is as follows: * | -- sizeof(Signal) -- | ------ a few MB ----- | - * The first section is for allreduce synchronization, and the second section - * is for storing the intermediate results required by some allreduce algos. + * The first section is for allreduce synchronization, and the second + * section is for storing the intermediate results required by some + * allreduce algos. * * Note: this class does not own any device memory. Any required buffers * are passed in from the constructor. 
*/ CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz, - int rank, int world_size, bool full_nvlink = true) + int rank, int world_size, bool fully_connected = true) : rank_(rank), world_size_(world_size), - full_nvlink_(full_nvlink), + fully_connected_(fully_connected), self_sg_(signals[rank]), d_rank_data_base_(reinterpret_cast(rank_data)), d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { @@ -361,8 +447,7 @@ class CustomAllreduce { void* base_ptr; // note: must share the base address of each allocation, or we get wrong // address - if (cuPointerGetAttribute(&base_ptr, - CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr, (CUdeviceptr)ptr) != CUDA_SUCCESS) throw std::runtime_error("failed to get pointer attr"); CUDACHECK(cudaIpcGetMemHandle( @@ -396,11 +481,11 @@ class CustomAllreduce { // Note: when registering graph buffers, we intentionally choose to not // deduplicate the addresses. That means if the allocator reuses some - // addresses, they will be registered again. This is to account for the remote - // possibility of different allocation patterns between ranks. For example, - // rank 1 may get the same input address for the second allreduce, but rank 2 - // got a different address. IPC handles have internal reference counting - // mechanism so overhead should be small. + // addresses, they will be registered again. This is to account for the + // remote possibility of different allocation patterns between ranks. For + // example, rank 1 may get the same input address for the second allreduce, + // but rank 2 got a different address. IPC handles have internal reference + // counting mechanism so overhead should be small. void register_graph_buffers( const std::vector& handles, const std::vector>& offsets) { @@ -431,15 +516,15 @@ class CustomAllreduce { /** * Performs allreduce, assuming input has already been registered. 
* - * Block and grid default configs are results after careful grid search. Using - * 36 blocks give the best or close to the best runtime on the devices I - * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only - * take a small amount of SMs. Not quite sure the underlying reason, but my - * guess is that too many SMs will cause contention on NVLink bus. + * Block and grid default configs are results after careful grid search. + * Using 36 blocks give the best or close to the best runtime on the devices + * I tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also + * only take a small amount of SMs. Not quite sure the underlying reason, + * but my guess is that too many SMs will cause contention on NVLink bus. */ template void allreduce(cudaStream_t stream, T* input, T* output, int size, - int threads = 512, int block_limit = 36) { + int threads = 512, int block_limit = defaultBlockLimit) { auto d = packed_t::P::size; if (size % d != 0) throw std::runtime_error( @@ -473,13 +558,11 @@ class CustomAllreduce { #define KL(ngpus, name) \ name<<>>(ptrs, sg_, self_sg_, output, \ rank_, size); - // TODO(hanzhi713): Threshold is different for A100 and H100. - // Add per device threshold. #define REDUCE_CASE(ngpus) \ case ngpus: { \ if (world_size_ == 2) { \ KL(ngpus, cross_device_reduce_1stage); \ - } else if (full_nvlink_) { \ + } else if (fully_connected_) { \ if ((world_size_ <= 4 && bytes < 512 * 1024) || \ (world_size_ <= 8 && bytes < 256 * 1024)) { \ KL(ngpus, cross_device_reduce_1stage); \ @@ -497,7 +580,8 @@ class CustomAllreduce { REDUCE_CASE(8) default: throw std::runtime_error( - "custom allreduce only supports num gpus in (2,4,6,8). Actual num " + "custom allreduce only supports num gpus in (2,4,6,8). 
Actual " + "num " "gpus = " + std::to_string(world_size_)); } @@ -511,10 +595,11 @@ class CustomAllreduce { } } }; + /** - * To inspect PTX/SASS, copy paste this header file to compiler explorer and add - a template instantiation: + * To inspect PTX/SASS, copy paste this header file to compiler explorer and + add a template instantiation: * template void vllm::CustomAllreduce::allreduce(cudaStream_t, half *, half *, int, int, int); */ -} // namespace vllm +} // namespace vllm \ No newline at end of file diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu index b59ea40d980f4..f7f0823465d30 100644 --- a/csrc/custom_all_reduce_test.cu +++ b/csrc/custom_all_reduce_test.cu @@ -1,9 +1,9 @@ /** * This is a standalone test for custom allreduce. * To compile, make sure you have MPI and NCCL installed in your system. - * export MPI_HOME=xxx + * export MPI_HOME=XXX * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o - * custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi + * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi * * Warning: this C++ test is not designed to be very readable and was used * during the rapid prototyping process. 
@@ -22,7 +22,15 @@ #include "cuda_profiler_api.h" #include "custom_all_reduce.cuh" #include "mpi.h" -#include "nccl.h" +#ifdef USE_ROCM + #include +typedef __hip_bfloat16 nv_bfloat16; + #include "rccl/rccl.h" + #include "custom_all_reduce_hip.cuh" +#else + #include "nccl.h" + #include "custom_all_reduce.cuh" +#endif #define MPICHECK(cmd) \ do { \ @@ -43,16 +51,29 @@ } \ } while (0) +#ifdef USE_ROCM __global__ void dummy_kernel() { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + for (int i = 0; i < 100; i++) { + uint64_t start = wall_clock64(); + uint64_t cycles_elapsed; + do { + cycles_elapsed = wall_clock64() - start; + } while (cycles_elapsed < 100); + } for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms +} #else +__global__ void dummy_kernel() { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms + #else for (int i = 0; i < 100; i++) { long long int start = clock64(); while (clock64() - start < 150000000); // approximately 98.4ms on P40 } -#endif + #endif } +#endif template __global__ void set_data(T* data, int size, int myRank) { @@ -121,8 +142,14 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit, * registration, they are allocated and registered together in the test for * convenience. */ +#ifdef USE_ROCM + CUDACHECK(hipExtMallocWithFlags( + (void**)&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal), + hipDeviceMallocUncached)); +#else CUDACHECK( cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal))); +#endif CUDACHECK( cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal))); CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T))); @@ -311,13 +338,18 @@ int main(int argc, char** argv) { bool performance_test = true; cudaProfilerStart(); - // Uncomment to scan through different block size configs. 
- // for (int threads : {256, 512, 1024}) { - // for (int block_limit = 16; block_limit < 112; block_limit += 4) { - // run(myRank, nRanks, comm, threads, block_limit, 1024 * 1024, - // performance_test); - // } - // } +// Uncomment to scan through different block size configs. +// for (int threads : {256, 512, 1024}) { +// for (int block_limit = 16; block_limit < 112; block_limit += 4) { +// run(myRank, nRanks, comm, threads, block_limit, 1024 * 1024, +// performance_test); +// } +// } +#ifdef USE_ROCM + const int block_limit = 16; +#else + const int block_limit = 36; +#endif // Scan through different sizes to test performance. for (int sz = 512; sz <= (8 << 20); sz *= 2) { run(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test); @@ -326,4 +358,4 @@ int main(int argc, char** argv) { cudaProfilerStop(); MPICHECK(MPI_Finalize()); return EXIT_SUCCESS; -} +} \ No newline at end of file diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index febc4eccd9561..dbe0e30f5cbfe 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -48,4 +48,14 @@ struct enable_sm90_or_later : Kernel { Kernel::operator()(std::forward(args)...); #endif } -}; \ No newline at end of file +}; + +template +struct enable_sm90_only : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900 + Kernel::operator()(std::forward(args)...); +#endif + } +}; diff --git a/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp new file mode 100644 index 0000000000000..5c1d6e3f46be0 --- /dev/null +++ b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp @@ -0,0 +1,457 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. 
SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// +// This file is a modified excerpt of +// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp +// from https://github.com/NVIDIA/cutlass v3.5.0 +// It has been modified to support either row/column or scalar broadcasting +// where the tensor being loaded from is always passed in via a device pointer. 
+// This lets one compiled kernel handle all cases of per-tensor or +// per-channel/per-token quantization. +// +// This interface also allows the scales to be passed in as tensors that +// consistently reside on the device, which avoids an issue with a previous +// implementation where scalars needed to be on the CPU since they +// were passed in via float values. This created a potential performance hazard +// if scales were initially on the device, and caused torch.compile graphs +// breaks when moving scales to the CPU. +// +#pragma once + +// Turn off clang-format for the entire file to keep it close to upstream +// clang-format off + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" + +#include "cute/tensor.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +// Row vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90RowOrScalarBroadcastArray { + static_assert(Stages == 0, "Row broadcast doesn't support smem usage"); + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{}); + + struct SharedStorage { + array_aligned(CtaTileShapeMNK{})> smem; + }; + + // This struct has been modified to have a bool indicating that ptr_row is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_row is null. 
+ struct Arguments { + const Element* const* ptr_row_array = nullptr; + bool row_broadcast = true; + StrideMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcastArray() { } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage) + : params(params) + , smem(const_cast(shared_storage.smem.data())) { } + + Params params; + Element *smem = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.row_broadcast && *(params.ptr_row_array[group]) == Element(0)); + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, + GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, + SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_, + CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_, + int group, Params const& params_) + : tGS_gRow(tGS_gRow_) + , tGS_sRow(tGS_sRow_) + , tGS_cRow(tGS_cRow_) + , tiled_G2S(tiled_g2s_) + , 
tSR_sRow(tSR_sRow_) + , tSR_rRow(tSR_rRow_) + , tCcRow(tCcRow_) + , residue_tCcRow(residue_tCcRow_) + , group(group) + , params(params_) {} + + GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N) + GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N) + GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N) + Tiled_G2S tiled_G2S; + + SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + CTensor tCcRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tCcRow; // (m, n) + ThrNum thr_num; + int group; + Params const& params; + + CUTLASS_DEVICE void + begin() { + if (!params.row_broadcast) { + fill(tSR_rRow, *(params.ptr_row_array[group])); + return; + } + + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + Tensor tGS_gRow_flt = filter_zeros(tGS_gRow); + Tensor tGS_sRow_flt = filter_zeros(tGS_sRow); + Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride())); + + for (int i = 0; i < size(tGS_gRow_flt); ++i) { + if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) { + continue; // OOB of SMEM, + } + if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) { + tGS_sRow_flt(i) = tGS_gRow_flt(i); + } + else { + tGS_sRow_flt(i) = Element(0); // Set to Zero when OOB so LDS could be issue without any preds. 
+ } + } + synchronize(); + } + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if (epi_m == 0) { // Assumes M-major subtile loop + if (!params.row_broadcast) return; // Do not issue LDS when row is scalar + Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n)); + Tensor tSR_rRow_flt = filter_zeros(tSR_rRow); + copy(tSR_sRow_flt, tSR_rRow_flt); + } + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_row; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_row[i] = tSR_rRow(epi_v * FragmentSize + i); + } + + return frg_row; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + using ThreadCount = decltype(size(args.tiled_copy)); + + Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row_array[l]), make_shape(M,N,1), params.dRow); + Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N) + Tensor sRow = make_tensor(make_smem_ptr(smem), + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N) + //// G2S: Gmem to Smem + auto tiled_g2s = make_tiled_copy(Copy_Atom{}, + Layout< Shape<_1, ThreadCount>, + Stride<_0, _1>>{}, + Layout<_1>{}); + auto thr_g2s = tiled_g2s.get_slice(args.thread_idx); + Tensor tGS_gRow = thr_g2s.partition_S(gRow); + Tensor tGS_sRow = thr_g2s.partition_D(sRow); + + //// G2S: Coord + auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}))); + Tensor tGS_cRow = thr_g2s.partition_S(cRow); + + //// S2R: Smem to Reg + Tensor tSR_sRow = sm90_partition_for_epilogue(sRow, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tSR_rRow = 
make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N) + + return ConsumerStoreCallbacks( + tGS_gRow, + tGS_sRow, + tGS_cRow, tiled_g2s, + tSR_sRow, + tSR_rRow, + args.tCcD, + args.residue_cD, + ThreadCount{}, + l, + params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Column vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90ColOrScalarBroadcastArray { + static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert( + (cute::is_same_v>) || // col vector broadcast, e.g. per-row alpha/bias + (cute::is_same_v>)); // batched col vector broadcast, e.g. batched per-row bias + + // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem + struct SharedStorage { }; + + // This struct has been modified to have a bool indicating that ptr_col is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_col is null. 
+ struct Arguments { + const Element* const* ptr_col_array = nullptr; + bool col_broadcast = true; + StrideMNL dCol = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.col_broadcast && *(params.ptr_col_array[group]) == Element(0)); + } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcastArray() { } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GTensor&& tCgCol, + RTensor&& tCrCol, + CTensor&& tCcCol, + ProblemShape problem_shape, + int group, + Params const& params + ): + tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + tCcCol(cute::forward(tCcCol)), + m(get<0>(problem_shape)), + group(group), + params(params) {} + + GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + Params 
const& params; + int m; + int group; + + CUTLASS_DEVICE void + begin() { + Tensor pred = make_tensor(shape(tCgCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tCcCol(i)) < m; + } + + if (!params.col_broadcast) { + fill(tCrCol, *(params.ptr_col_array[group])); + return; + } + + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + copy_if(pred, filter(tCgCol), filter(tCrCol)); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_col; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i); + } + + return frg_col; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col_array[l]), make_shape(M,N,1), params.dCol); + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + // Generate an identity tensor matching the shape of the global tensor and + // partition the same way, this will be used to generate the predicate + // tensor for loading + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tCgCol), + cute::move(tCrCol), + 
cute::move(tCcCol), + args.problem_shape_mnkl, + l, + params + ); + } +}; + +} diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index 0a812dc56a994..62b848a0a9635 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -1,6 +1,7 @@ #pragma once #include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp" +#include "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp" /* This file defines custom epilogues for fusing channel scales, token scales, @@ -69,6 +70,16 @@ struct ScaledEpilogueBase { 0 /*Stages*/, TileShape, T, T, Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + template + using ColOrScalarLoadArray = + cutlass::epilogue::fusion::Sm90ColOrScalarBroadcastArray< + 0 /*Stages*/, TileShape, T, Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoadArray = + cutlass::epilogue::fusion::Sm90RowOrScalarBroadcastArray< + 0 /*Stages*/, TileShape, T, Stride, Int<1>, Int<0>>>; + // This utility function constructs the arguments for the load descriptors // from a tensor. It can handle both row and column, as well as row/column or // scalar cases. @@ -96,6 +107,14 @@ struct ScaledEpilogueBase { std::is_same_v>); return Arguments{data_ptr}; } + + template + static auto args_from_tensor(const T* const* data_ptr, bool do_broadcast) { + using Arguments = typename Descriptor::Arguments; + static_assert(std::is_same_v> || + std::is_same_v>); + return Arguments{data_ptr, do_broadcast}; + } }; /* @@ -381,4 +400,51 @@ struct ScaledEpilogueBiasAzpToken } }; +/* + This epilogue works like ScaledEpilogue, but ScaleA and ScaleB are pointers + to arrays containing different scales used in group gemm. The number of + pointers in ScaleA and the number of pointers in ScaleB are equal to the + group size. 
+*/ +template +struct ScaledEpilogueArray + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoadArray; + using ScaleB = typename SUPER::template RowOrScalarLoadArray; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + using ScaleAArray = typename SUPER::template ColOrScalarLoadArray; + using ScaleBArray = typename SUPER::template RowOrScalarLoadArray; + + static ArgumentType prepare_args(float const* const* a_scales_ptr, + float const* const* b_scales_ptr, + bool a_col_broadcast, bool b_row_broadcast) { + auto a_args = SUPER::template args_from_tensor( + a_scales_ptr, a_col_broadcast); + auto b_args = SUPER::template args_from_tensor( + b_scales_ptr, b_row_broadcast); + + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, {}}; + } +}; + }; // namespace vllm::c3x diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index f0e5533bcae60..98daf1a1b8e6c 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize); // in case the final state is separated between the last "smem_exchange" and // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), - // (which occurs when `final_state_position` is a non-positivie index) + 
// (which occurs when `final_state_position` is a non-positive index) // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){ input_t vals_load[kNElts] = {0}; diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py new file mode 100644 index 0000000000000..d1c0d92f6814a --- /dev/null +++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +import glob +import itertools +import os +import subprocess + +import jinja2 + +FILE_HEAD = """ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { +""".strip() + +TEMPLATE = ("template __global__ void Marlin<" + "{{scalar_t}}, " + "{{w_type_id}}, " + "{{threads}}, " + "{{thread_m_blocks}}, " + "{{thread_n_blocks}}, " + "{{thread_k_blocks}}, " + "{{'true' if m_block_size_8 else 'false'}}, " + "{{stages}}, " + "{{'true' if has_act_order else 'false'}}, " + "{{'true' if has_zp else 'false'}}, " + "{{group_blocks}}, " + "{{'true' if is_zp_float else 'false'}}>" + "( MARLIN_KERNEL_PARAMS );") + +# int8 with zero point case (vllm::kU8) is also supported, +# we don't add it to reduce wheel size. 
+SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128"] +THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)] + +THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4] +# group_blocks: +# = 0 : act order case +# = -1 : channelwise quantization +# > 0 : group_size=16*group_blocks +GROUP_BLOCKS = [0, -1, 2, 4, 8] +DTYPES = ["fp16", "bf16"] + + +def remove_old_kernels(): + for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"): + subprocess.call(["rm", "-f", filename]) + + +def generate_new_kernels(): + for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES): + has_zp = "B" not in scalar_type + all_template_str_list = [] + + for group_blocks, m_blocks, thread_configs in itertools.product( + GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS): + + has_act_order = group_blocks == 0 + if has_zp and has_act_order: + continue + if thread_configs[2] == 256: + if m_blocks <= 1 and thread_configs[0] != 128: + continue + if m_blocks > 1 and thread_configs[0] != 64: + continue + + k_blocks = thread_configs[0] // 16 + n_blocks = thread_configs[1] // 16 + threads = thread_configs[2] + + c_dtype = "half" if dtype == "fp16" else "nv_bfloat16" + + template_str = jinja2.Template(TEMPLATE).render( + scalar_t=c_dtype, + w_type_id=scalar_type + ".id()", + threads=threads, + thread_m_blocks=max(m_blocks, 1), + thread_n_blocks=n_blocks, + thread_k_blocks=k_blocks, + m_block_size_8=m_blocks == 0.5, + stages="pipe_stages", + has_act_order=has_act_order, + has_zp=has_zp, + group_blocks=group_blocks, + is_zp_float=False, + ) + + all_template_str_list.append(template_str) + + file_content = FILE_HEAD + "\n\n" + file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" + filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu" + + with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: + f.write(file_content) + + +if __name__ == "__main__": + remove_old_kernels() + generate_new_kernels() diff --git a/csrc/moe/marlin_moe_wna16/kernel.h 
b/csrc/moe/marlin_moe_wna16/kernel.h new file mode 100644 index 0000000000000..3d92660e8028e --- /dev/null +++ b/csrc/moe/marlin_moe_wna16/kernel.h @@ -0,0 +1,44 @@ + +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin_moe_wna16 +#endif + +#include "quantization/gptq_marlin/marlin.cuh" +#include "quantization/gptq_marlin/marlin_dtypes.cuh" +#include "core/scalar_type.hpp" + +#define MARLIN_KERNEL_PARAMS \ + const int4 *__restrict__ A, const int4 *__restrict__ B, \ + int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ + const int4 *__restrict__ scales_ptr, const int4 *__restrict__ zp_ptr, \ + const int *__restrict__ g_idx, \ + const int32_t *__restrict__ sorted_token_ids_ptr, \ + const int32_t *__restrict__ expert_ids_ptr, \ + const int32_t *__restrict__ num_tokens_past_padded_ptr, \ + const float *__restrict__ topk_weights_ptr, int top_k, \ + bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \ + int prob_n, int prob_k, int *locks, bool use_atomic_add, \ + bool use_fp32_reduce + +namespace MARLIN_NAMESPACE_NAME { +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? + > +__global__ void Marlin(MARLIN_KERNEL_PARAMS); + +} diff --git a/csrc/moe/marlin_moe_wna16/marlin_template.h b/csrc/moe/marlin_moe_wna16/marlin_template.h new file mode 100644 index 0000000000000..205b308fe511b --- /dev/null +++ b/csrc/moe/marlin_moe_wna16/marlin_template.h @@ -0,0 +1,1917 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ + +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin_moe_wna16 +#endif + +#include "quantization/gptq_marlin/marlin.cuh" +#include "quantization/gptq_marlin/marlin_dtypes.cuh" +#include "core/scalar_type.hpp" + +#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ + static_assert(std::is_same::value || \ + std::is_same::value, \ + "only float16 and bfloat16 is supported"); + +namespace MARLIN_NAMESPACE_NAME { + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? 
+ > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape + // (k/groupsize)x(n/pack_factor) + const int* __restrict__ g_idx, // int32 group indices of shape k + const int32_t* __restrict__ sorted_token_ids_ptr, // moe sorted_ids + const int32_t* __restrict__ expert_ids_ptr, // moe expert ids + const int32_t* __restrict__ num_tokens_past_padded_ptr, // moe num tokens + const float* __restrict__ topk_weights_ptr, // moe top weights + int top_k, // num of experts per token + bool mul_topk_weights, // mul topk weights or not + bool is_ep, // expert parallelism + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_atomic_add, // whether to use atomic add to reduce + bool use_fp32_reduce // whether to use fp32 global reduce +) {} + +} // namespace MARLIN_NAMESPACE_NAME + +#else + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. 
+template +__device__ inline void mma(const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, + typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + float* c = reinterpret_cast(&frag_c); + if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); + } +} + +template +__device__ inline void mma_trans( + const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, + const typename ScalarType::FragB& frag_b2, + typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + const uint32_t* b2 = reinterpret_cast(&frag_b2); + float* c = reinterpret_cast(&frag_c); + if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : 
"=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); + } +} + +// Instruction for loading a full 16x16 matrix fragment of operand A from shared +// memory, directly in tensor core layout. +template +__device__ inline void ldsm(typename ScalarType::FragA& frag_a, + const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + if constexpr (count == 4) { + asm volatile( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) + : "r"(smem)); + } else if constexpr (count == 2) { + asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" + : "=r"(a[0]), "=r"(a[1]) + : "r"(smem)); + } else if constexpr (count == 1) { + asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" + : "=r"(a[0]) + : "r"(smem)); + } else { + static_assert(count == 1 || count == 2 || count == 4, "invalid count"); + } +} + +// Lookup-table based 3-input logical operation; explicitly used for +// dequantization as the compiler does not seem to automatically recognize it in +// all cases. 
+template +__device__ inline int lop3(int a, int b, int c) { + int res; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(res) + : "r"(a), "r"(b), "r"(c), "n"(lut)); + return res; +} + +// Constructs destination register by taking bytes from 2 sources (based on +// mask) +template +__device__ inline uint32_t prmt(uint32_t a) { + uint32_t res; + asm volatile("prmt.b32 %0, %1, %2, %3;\n" + : "=r"(res) + : "r"(a), "n"(start_byte), "n"(mask)); + return res; +} + +template +__device__ inline typename ScalarType::FragB dequant( + int q, typename ScalarType::FragB& frag_b); + +// +// Efficiently dequantize 4bit values packed in an int32 value into a full +// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below, +// with some small changes: +// - FP16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287 +// - BF16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385 +// +template <> +__device__ inline typename ScalarType::FragB dequant( + int q, typename ScalarType::FragB& frag_b) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point + // directly into `SUB` and `ADD`. 
+ const int SUB = 0x64086408; + const int MUL = 0x2c002c00; + const int ADD = 0xd480d480; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +template <> +__device__ inline typename ScalarType::FragB +dequant(int q, + typename ScalarType::FragB& frag_b) { + static constexpr uint32_t MASK = 0x000f000f; + static constexpr uint32_t EX = 0x43004300; + + // Guarantee that the `(a & b) | c` operations are LOP3s. + + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + q >>= 4; + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + + static constexpr uint32_t MUL = 0x3F803F80; + static constexpr uint32_t ADD = 0xC308C308; + + frag_b[0] = __hfma2(*reinterpret_cast(&lo), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +// +// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or +// bf16 Reference: +// - FP16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85 +// - BF16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175 +// +template <> +__device__ inline typename ScalarType::FragB dequant( + int q, typename ScalarType::FragB& frag_b) { + static constexpr uint32_t mask_for_elt_01 = 0x5250; + static constexpr uint32_t mask_for_elt_23 = 0x5351; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + + uint32_t lo = prmt(q); + uint32_t hi = prmt(q); + + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; + + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + 
frag_b[1] = __hsub2(*reinterpret_cast(&hi), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + return frag_b; +} + +template <> +__device__ inline typename ScalarType::FragB +dequant(int q, + typename ScalarType::FragB& frag_b) { + float fp32_intermediates[4]; + uint32_t* fp32_intermediates_casted = + reinterpret_cast(fp32_intermediates); + + static constexpr uint32_t fp32_base = 0x4B000000; + fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650); + fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652); + fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651); + fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653); + + fp32_intermediates[0] -= 8388736.f; + fp32_intermediates[1] -= 8388736.f; + fp32_intermediates[2] -= 8388736.f; + fp32_intermediates[3] -= 8388736.f; + + uint32_t* bf16_result_ptr = reinterpret_cast(&frag_b); + bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], + fp32_intermediates_casted[1], 0x7632); + bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], + fp32_intermediates_casted[3], 0x7632); + + return frag_b; +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. 
+template +__device__ inline void scale(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s = + ScalarType::num2num2(reinterpret_cast(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +template +__device__ inline void scale_and_sub( + typename ScalarType::FragB& frag_b, scalar_t s, scalar_t zp) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s2 = ScalarType::num2num2(s); + scalar_t2 zp2 = ScalarType::num2num2(zp); + frag_b[0] = __hfma2(frag_b[0], s2, __hneg2(zp2)); + frag_b[1] = __hfma2(frag_b[1], s2, __hneg2(zp2)); +} + +template +__device__ inline void sub_zp(typename ScalarType::FragB& frag_b, + typename ScalarType::scalar_t2& frag_zp, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 zp = + ScalarType::num2num2(reinterpret_cast(&frag_zp)[i]); + frag_b[0] = __hsub2(frag_b[0], zp); + frag_b[1] = __hsub2(frag_b[1], zp); +} + +// Same as above, but for act_order (each K is multiplied individually) +template +__device__ inline void scale4(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s_1, + typename ScalarType::FragS& frag_s_2, + typename ScalarType::FragS& frag_s_3, + typename ScalarType::FragS& frag_s_4, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s_val_1_2; + s_val_1_2.x = reinterpret_cast(&frag_s_1)[i]; + s_val_1_2.y = reinterpret_cast(&frag_s_2)[i]; + + scalar_t2 s_val_3_4; + s_val_3_4.x = reinterpret_cast(&frag_s_3)[i]; + s_val_3_4.y = reinterpret_cast(&frag_s_4)[i]; + + frag_b[0] = __hmul2(frag_b[0], s_val_1_2); + frag_b[1] = __hmul2(frag_b[1], s_val_3_4); +} + +// Given 2 floats multiply by 2 scales (halves) +template +__device__ inline void scale_float(float* c, + typename ScalarType::FragS& s) { + scalar_t* s_ptr = reinterpret_cast(&s); + c[0] = __fmul_rn(c[0], ScalarType::num2float(s_ptr[0])); + c[1] = __fmul_rn(c[1], 
ScalarType::num2float(s_ptr[1])); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int* lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. +__device__ inline void barrier_release(int* lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. + asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" + : + : "l"(lock), "r"(val)); + } +} + +// Wait until value of lock to be negative, and then add 1 +__device__ inline void wait_negative_and_add(int* lock) { + if (threadIdx.x == 0) { + int state = 0; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state >= 0); + atomicAdd(lock, 1); + } + __syncthreads(); +} + +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? 
+ > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape + // (k/groupsize)x(n/pack_factor) + const int* __restrict__ g_idx, // int32 group indices of shape k + const int32_t* __restrict__ sorted_token_ids_ptr, // moe sorted_ids + const int32_t* __restrict__ expert_ids_ptr, // moe expert ids + const int32_t* __restrict__ num_tokens_past_padded_ptr, // moe num tokens + const float* __restrict__ topk_weights_ptr, // moe top weights + int top_k, // num of experts per token + bool mul_topk_weights, // mul topk weights or not + bool is_ep, // expert parallelism + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_atomic_add, // whether to use atomic add to reduce + bool use_fp32_reduce // whether to use fp32 global reduce +) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. 
+ using Dtype = ScalarType; + using scalar_t2 = typename ScalarType::scalar_t2; + using FragA = typename ScalarType::FragA; + using FragB = typename ScalarType::FragB; + using FragC = typename ScalarType::FragC; + using FragS = typename ScalarType::FragS; + using FragZP = typename ScalarType::FragZP; + + extern __shared__ int4 sh[]; + static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id); + + constexpr int pack_factor = 32 / w_type.size_bits(); + static_assert(thread_m_blocks == 1 || !m_block_size_8); + constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks); + const int group_size = + (!has_act_order && group_blocks == -1) ? prob_k : prob_k / num_groups; + const int scales_expert_stride = prob_n * prob_k / group_size / 8; + const int zp_expert_stride = + is_zp_float ? prob_n * prob_k / group_size / 8 + : prob_n * prob_k / group_size / (pack_factor * 4); + + // parallel: num valid moe blocks + int num_tokens_past_padded = num_tokens_past_padded_ptr[0]; + int parallel = num_tokens_past_padded / moe_block_size; + int num_valid_blocks = parallel; + if (is_ep) { + for (int i = 0; i < parallel; i++) { + if (expert_ids_ptr[i] == -1) num_valid_blocks--; + } + } + int num_invalid_blocks = parallel - num_valid_blocks; + parallel = num_valid_blocks; + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x); + + if constexpr (!has_act_order && group_blocks != -1) { + if (group_blocks >= thread_k_blocks) { + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts + // in the middle of group. 
+ iters = (group_blocks / thread_k_blocks) * + div_ceil(iters, (group_blocks / thread_k_blocks)); + } + } + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + int par_id = 0; + int block_id = -1; + int64_t expert_id = 0; // use int64 to avoid computation result overflow + int old_expert_id = 0; + int64_t B_expert_off = 0; + + int4* sh_block_sorted_ids_int4 = sh; + int32_t* sh_block_sorted_ids = + reinterpret_cast(sh_block_sorted_ids_int4); + int4* sh_block_topk_weights_int4 = + sh_block_sorted_ids_int4 + moe_block_size / 4; + scalar_t2* sh_block_topk_weights = + reinterpret_cast(sh_block_topk_weights_int4); + int4* sh_new = sh_block_topk_weights_int4 + moe_block_size / 4; + + int32_t block_num_valid_tokens = 0; + int32_t locks_off = 0; + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + slice_col = slice_col_par % n_tiles; + par_id = slice_col_par / n_tiles; + } + if (parallel * n_tiles >= gridDim.x) { + // when parallel * n_tiles >= sms + // then there are at most $sms$ conflict tile blocks + locks_off = blockIdx.x; + } else { + locks_off = (iters * blockIdx.x) / k_tiles - 1; + } + + // read moe block data given block_id + // block_sorted_ids / block_num_valid_tokens / block_topk_weights + auto read_moe_block_data = [&](int block_id) { + block_num_valid_tokens = moe_block_size; + #pragma unroll + for (int i = 0; i < moe_block_size / 4; i++) { + int4 sorted_token_ids_int4 = reinterpret_cast( + sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i]; + int* sorted_token_ids = reinterpret_cast(&sorted_token_ids_int4); + #pragma unroll + 
for (int j = 0; j < 4; j++) { + if (sorted_token_ids[j] >= prob_m * top_k) { + block_num_valid_tokens = i * 4 + j; + break; + } + } + if (block_num_valid_tokens != moe_block_size) break; + } + + __syncthreads(); + int tid4 = threadIdx.x / 4; + if (threadIdx.x % 4 == 0 && threadIdx.x < block_num_valid_tokens) { + sh_block_sorted_ids_int4[tid4] = reinterpret_cast( + sorted_token_ids_ptr)[block_id * moe_block_size / 4 + tid4]; + + if (mul_topk_weights) { + #pragma unroll + for (int i = 0; i < 4; i++) { + sh_block_topk_weights[tid4 * 4 + i] = + Dtype::num2num2(Dtype::float2num( + topk_weights_ptr[sh_block_sorted_ids[tid4 * 4 + i]])); + } + } + } + __syncthreads(); + }; + + // when move to next moe block, find the next block_id and expert_id + // and then read moe block data + auto update_next_moe_block_data = [&]() { + if (par_id >= parallel) return; + + old_expert_id = expert_id; + if (num_invalid_blocks > 0) { + int skip_count = block_id == -1 ? par_id : 0; + block_id++; + for (int i = block_id; i < num_tokens_past_padded / moe_block_size; i++) { + expert_id = expert_ids_ptr[i]; + if (expert_id != -1) { + if (skip_count == 0) { + block_id = i; + break; + }; + skip_count--; + }; + } + } else { + block_id = par_id; + expert_id = expert_ids_ptr[block_id]; + } + + B_expert_off = expert_id * prob_n * prob_k / (pack_factor * 4); + scales_ptr += (expert_id - old_expert_id) * scales_expert_stride; + if constexpr (has_zp) { + zp_ptr += (expert_id - old_expert_id) * zp_expert_stride; + } + if constexpr (has_act_order) { + g_idx += (expert_id - old_expert_id) * prob_k; + } + + read_moe_block_data(block_id); + }; + + // Compute all information about the current slice which is required for + // synchronization. 
+ auto init_slice = [&](bool first_init = false) { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * div_ceil(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = div_ceil(k_tiles - col_off, iters); + if (col_off > 0) slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) slice_idx--; + } + } + if (parallel * n_tiles >= gridDim.x) { + if (slice_count > 1 && slice_idx == slice_count - 1) { + locks_off++; + } + } else { + locks_off++; + } + + if (first_init && use_atomic_add && slice_count > 1 && slice_idx == 0) { + constexpr int threads_per_m = 16 * thread_n_blocks / 8; + int m_per_thread = + div_ceil(block_num_valid_tokens, threads / threads_per_m); + for (int i = 0; i < m_per_thread; i++) { + int row = threads / threads_per_m * i + threadIdx.x / threads_per_m; + if (row < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[row]; + int col = slice_col * 16 * thread_n_blocks / 8 + + threadIdx.x % threads_per_m; + C[sorted_row * prob_n / 8 + col] = {0, 0, 0, 0}; + } + } + // After write zero to output, write a negative value to lock. + // Every SM that processes the same slice would wait for + // the negative value, and then atomicAdd 1 to it. + // After all SMs are processed, the lock value would back to 0 again. 
+ __syncthreads(); + if (threadIdx.x == 0) locks[locks_off] = 1 - slice_count; + } + + if (slice_col == n_tiles) { + slice_col = 0; + par_id++; + update_next_moe_block_data(); + } + }; + + update_next_moe_block_data(); + init_slice(true); + + // A sizes/strides + + // stride of the A matrix in global memory + int a_gl_stride = prob_k / 8; + // stride of an A matrix tile in shared memory + constexpr int a_sh_stride = 16 * thread_k_blocks / 8; + // delta between subsequent A tiles in global memory + constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8; + // between subsequent accesses within a tile + int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o); + // between shared memory writes + constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o); + // between shared memory tile reads + constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4)); + // within a shared memory tile + constexpr int a_sh_rd_delta_i = a_sh_stride * 16; + // overall size of a tile + constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks); + // number of shared write iterations for a tile + constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta); + + // B sizes/strides + int b_gl_stride = 16 * prob_n / (pack_factor * 4); + constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4; + constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 
1 : 2; + constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs; + + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads); + constexpr int b_sh_wr_delta = threads * b_thread_vecs; + constexpr int b_sh_rd_delta = threads * b_thread_vecs; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + // Scale sizes/strides without act_order + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 8; + constexpr int s_tb_groups = + !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks + ? thread_k_blocks / group_blocks + : 1; + constexpr int s_sh_stage = s_tb_groups * s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + + // Scale size/strides with act_order + constexpr int tb_k = 16 * thread_k_blocks; + constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0; + // constexpr int act_s_row_stride = 1; + // int act_s_col_stride = act_s_row_stride * num_groups; + int act_s_col_stride = 1; + int act_s_col_warp_stride = act_s_col_stride * 8; + int tb_n_warps = thread_n_blocks / 4; + int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; + + // Zero-points sizes/strides + int zp_gl_stride = is_zp_float ? prob_n / 8 : (prob_n / pack_factor) / 4; + constexpr int zp_sh_stride = is_zp_float + ? 16 * thread_n_blocks / 8 + : ((16 * thread_n_blocks) / pack_factor) / 4; + constexpr int zp_tb_groups = s_tb_groups; + constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0; + int zp_gl_rd_delta = zp_gl_stride; + + // Global A read index of current thread. + int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. 
+ int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % (16 / (m_block_size_8 ? 2 : 1))) + + (threadIdx.x % 32) / (16 / (m_block_size_8 ? 2 : 1)); + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads) * b_thread_vecs; + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + int b_sh_wr = threadIdx.x * b_thread_vecs; + int b_sh_rd = threadIdx.x * b_thread_vecs; + + // For act_order + constexpr int k_iter_size = tb_k / b_sh_wr_iters; + int slice_k_start = tb_k * slice_row; + int slice_k_finish = slice_k_start + tb_k * slice_iters; + int slice_k_start_shared_fetch = slice_k_start; + int slice_n_offset = act_s_col_tb_stride * slice_col; + + // No act_order + int s_gl_rd; + if constexpr (!has_act_order) { + if constexpr (group_blocks == -1) { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + } else { + s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + s_sh_stride * slice_col + threadIdx.x; + } + } + int s_sh_wr = threadIdx.x; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // Zero-points + int zp_gl_rd; + if constexpr (has_zp) { + if constexpr (group_blocks == -1) { + zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; + } else { + zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + zp_sh_stride * slice_col + threadIdx.x; + } + } + int zp_sh_wr = threadIdx.x; + bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride; + + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. 
+ int s_sh_rd; + if constexpr (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + else if constexpr (group_blocks == -1 && (m_block_size_8 || has_zp)) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 8; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + + // Zero-points have the same read layout as the scales + // (without column-wise case) + constexpr int num_col_threads = 8; + constexpr int num_row_threads = 4; + constexpr int num_ints_per_thread = 8 / pack_factor; + int zp_sh_rd; + if constexpr (has_zp) { + if constexpr (is_zp_float) { + if constexpr (group_blocks != -1) { + zp_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + } + } else { + zp_sh_rd = num_ints_per_thread * num_col_threads * + ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads); + } + } + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? + auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. 
+ int a_sh_wr_trans[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + // Shared memory storage for global fetch pipelines. + int4* sh_a = sh_new; + int4* sh_b = sh_a + (stages * a_sh_stage); + int4* sh_g_idx = sh_b + (stages * b_sh_stage); + int4* sh_zp = sh_g_idx + (stages * g_idx_stage); + int4* sh_s = sh_zp + (stages * zp_sh_stage); + int4* sh_red = sh_b; + + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2][b_thread_vecs]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; // No act-order + FragS act_frag_s[2][4][4]; // For act-order + int frag_qzp[2][num_ints_per_thread]; // Zero-points + FragZP frag_zp; // Zero-points in fp16 + FragZP frag_zpf[2]; // Zero-points in fp16 in HQQ + + // Zero accumulators. 
+ auto zero_accums = [&]() { + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) + reinterpret_cast(frag_c)[i] = 0; + }; + + int sh_first_group_id = -1; + int sh_num_groups = -1; + constexpr int sh_max_num_groups = 32; + + auto fetch_act_order_scales_to_shared = [&](bool is_async, int first_group_id, + int last_group_id) { + sh_first_group_id = first_group_id; + sh_num_groups = last_group_id - first_group_id + 1; + + if (sh_num_groups < sh_max_num_groups) { + sh_num_groups = sh_max_num_groups; + } + + if (sh_first_group_id + sh_num_groups > num_groups) { + sh_num_groups = num_groups - sh_first_group_id; + } + + int row_offset = first_group_id * s_gl_stride; + + if (is_async) { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x], + &scales_ptr[row_offset + (i * s_gl_stride) + + slice_n_offset + threadIdx.x]); + } + } + } else { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + sh_s[(i * s_sh_stride) + threadIdx.x] = + scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + + threadIdx.x]; + } + } + } + }; + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. 
+ int a_remaining_load_count_in_slice = stages; + auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { + if (pred) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + if (prob_k > thread_k_blocks * 16 * stages || slice_col == 0 || + a_remaining_load_count_in_slice > 0) { + a_remaining_load_count_in_slice--; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + int a_idx = a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off; + int row = a_idx / a_gl_stride; + int64_t sorted_row = 0; + if (!m_block_size_8 || row < 8) + sorted_row = sh_block_sorted_ids[row] / top_k; + int64_t true_idx = sorted_row * a_gl_stride + a_idx % a_gl_stride; + cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[true_idx], + row < block_num_valid_tokens); + } + } + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < b_thread_vecs; j++) { + cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], + B_ptr[i] + j + B_expert_off); + } + + B_ptr[i] += b_gl_rd_delta_o; + } + + if constexpr (has_act_order) { + // Fetch g_idx thread-block portion + int full_pipe = a_off; + int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe; + if (cur_k < prob_k && cur_k < slice_k_finish) { + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + + int4 const* cur_g_idx_stage_ptr = + reinterpret_cast(&g_idx[cur_k]); + + if (threadIdx.x < g_idx_stage) { + cp_async4_pred(&sh_g_idx_stage[threadIdx.x], + &cur_g_idx_stage_ptr[threadIdx.x]); + } + } + } else { + if constexpr (group_blocks != -1) { + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch scales if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } else { + for (int i = 0; i < s_tb_groups; i++) { + if (s_sh_wr_pred) { + 
cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], + &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } + } + + if constexpr (has_zp && group_blocks != -1) { + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch zero-points if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]); + } + zp_gl_rd += zp_gl_rd_delta; + } + } else { + for (int i = 0; i < zp_tb_groups; i++) { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr], + &zp_ptr[zp_gl_rd]); + } + zp_gl_rd += zp_gl_rd_delta; + } + } + } + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + auto fetch_col_zp_to_shared = [&]() { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]); + } + }; + + auto fetch_col_scale_to_shared = [&]() { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. 
+ auto fetch_to_registers = [&](int k, int pipe) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm( + frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + + #pragma unroll + for (int i = 0; i < b_thread_vecs; i++) { + frag_b_quant[k % 2][i] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]); + } + }; + + bool is_same_group[stages]; + int same_group_id[stages]; + + auto init_same_group = [&](int pipe) { + if constexpr (!has_act_order) { + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + int group_id_1 = sh_g_idx_int_ptr[0]; + int group_id_2 = sh_g_idx_int_ptr[tb_k - 1]; + + is_same_group[pipe] = group_id_1 == group_id_2; + same_group_id[pipe] = group_id_1; + }; + + auto fetch_scales_to_registers = [&](int k, int full_pipe) { + int pipe = full_pipe % stages; + + if constexpr (!has_act_order) { + // No act-order case + if constexpr (group_blocks == -1) { + // load only when starting a new slice + if (k == 0 && full_pipe == 0) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } else if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_s_stage = + sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = k_blocks / group_blocks; + + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + reinterpret_cast(&frag_s[k % 2])[0] = + sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; + } 
+ } + + return; + } + + // Act-order case + + // Determine K of the "current" thread-block + int cur_k = slice_k_start + tb_k * full_pipe; + if (cur_k >= prob_k || cur_k >= slice_k_finish) { + return; + } + + // Reset (to current thread-block) since we read g_idx portion from the + // shared memory + cur_k = 0; + + // Progress to current iteration + cur_k += k_iter_size * (k % b_sh_wr_iters); + + // Determine "position" inside the thread-block (based on warp and + // thread-id) + int warp_id = threadIdx.x / 32; + int n_warps = + thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N + + int warp_row = warp_id / n_warps; + int warp_col = warp_id % n_warps; + + cur_k += warp_row * 16; + + int th_id = threadIdx.x % 32; + cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix + + int s_col_shift = + /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + + (th_id / 4) * act_s_col_stride; + + if (is_same_group[pipe]) { + if (k % 2 == 0) { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + + s_col_shift]; + } else { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); + } + + for (int i = 1; i < 4; i++) { + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); + } + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + constexpr int k_frag_offsets[4] = {0, 1, 8, + 9}; // Tensor core offsets per thread + + #pragma unroll + for (int i = 0; i < 4; i++) { + int actual_k = cur_k + k_frag_offsets[i]; + + int group_id = sh_g_idx_int_ptr[actual_k]; + int rel_group_id = group_id - sh_first_group_id; + + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + sh_s[rel_group_id * s_sh_stride + s_col_shift]; + } + }; + + auto fetch_zp_to_registers = [&](int k, int full_pipe) { + // This code does not handle 
group_blocks == 0, + // which signifies act_order. + // has_zp implies AWQ, which doesn't have act_order, + static_assert(!has_zp || group_blocks != 0); + + if constexpr (has_zp && !is_zp_float) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks == -1) { + // load only when starting a new slice + if (k == 0 && full_pipe == 0) { + #pragma unroll + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = (reinterpret_cast(sh_zp))[zp_sh_rd + i]; + } + } + + } else if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_zp_stage = + sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = + (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; + } + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = 0; + + // Suppress bogus and persistent divide-by-zero warning + #pragma nv_diagnostic push + #pragma nv_diag_suppress divide_by_zero + cur_group_id = k_blocks / group_blocks; + #pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + sh_zp_stage += cur_group_id * zp_sh_stride; + + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = + (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; + } + } + } + + else if constexpr (has_zp && is_zp_float) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_zp_stage = + sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd]; + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 
16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + // Suppress bogus and persistent divide-by-zero warning + #pragma nv_diagnostic push + #pragma nv_diag_suppress divide_by_zero + int cur_group_id = k_blocks / group_blocks; + #pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + reinterpret_cast(&frag_zpf[k % 2])[0] = + sh_zp_stage[zp_sh_rd + cur_group_id * zp_sh_stride]; + } + } + } + }; + + // Execute the actual tensor core matmul of a sub-tile. + bool is_first_matmul_in_slice = true; + auto matmul = [&](int k) { + int k2 = k % 2; + const bool is_new_zp = + ((group_blocks != -1) && (group_blocks < thread_k_blocks || k == 0)) || + (group_blocks == -1 && is_first_matmul_in_slice); + if constexpr (has_zp && !is_zp_float) { + if (is_new_zp) { + if constexpr (group_blocks == -1) is_first_matmul_in_slice = false; + FragB frag_zp_0; + FragB frag_zp_1; + int zp_quant_0, zp_quant_1; + + if constexpr (w_type.size_bits() == 4) { + zp_quant_0 = frag_qzp[k2][0]; + zp_quant_1 = zp_quant_0 >> 8; + } else { + static_assert(w_type.size_bits() == 8); + zp_quant_0 = frag_qzp[k2][0]; + zp_quant_1 = frag_qzp[k2][1]; + } + + dequant(zp_quant_0, frag_zp_0); + dequant(zp_quant_1, frag_zp_1); + + frag_zp[0] = frag_zp_0[0]; + frag_zp[1] = frag_zp_0[1]; + frag_zp[2] = frag_zp_1[0]; + frag_zp[3] = frag_zp_1[1]; + } + } + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. 
+ #pragma unroll + for (int j = 0; j < 4; j++) { + FragB frag_b0; + FragB frag_b1; + int b_quant_0, b_quant_1; + + if constexpr (w_type.size_bits() == 4) { + b_quant_0 = frag_b_quant[k2][0][j]; + b_quant_1 = b_quant_0 >> 8; + } else { + static_assert(w_type.size_bits() == 8); + int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k2]); + b_quant_0 = frag_b_quant_ptr[j * 2 + 0]; + b_quant_1 = frag_b_quant_ptr[j * 2 + 1]; + } + + dequant(b_quant_0, frag_b0); + dequant(b_quant_1, frag_b1); + + // Apply scale to frag_b0 + if constexpr (has_act_order) { + static_assert(group_blocks != -1); + scale4(frag_b0, act_frag_s[k2][0][j], act_frag_s[k2][1][j], + act_frag_s[k2][2][j], act_frag_s[k2][3][j], 0); + scale4(frag_b1, act_frag_s[k2][0][j], act_frag_s[k2][1][j], + act_frag_s[k][2][j], act_frag_s[k2][3][j], 1); + + } else if constexpr (has_zp && !is_zp_float && group_blocks == -1) { + int idx = (threadIdx.x / 4) % 2; + scalar_t2 s2 = Dtype::nums2num2( + reinterpret_cast(&frag_s[j / 2][j % 2 * 2 + 0])[idx], + reinterpret_cast(&frag_s[j / 2][j % 2 * 2 + 1])[idx]); + if (is_new_zp) frag_zp[j] = __hmul2(frag_zp[j], s2); + scale_and_sub(frag_b0, s2.x, frag_zp[j].x); + scale_and_sub(frag_b1, s2.y, frag_zp[j].y); + } else if constexpr (has_zp && !is_zp_float && group_blocks != -1) { + if (is_new_zp) + frag_zp[j] = __hmul2(frag_zp[j], + *reinterpret_cast(&frag_s[k2][j])); + scale_and_sub(frag_b0, frag_s[k % 2][j][0].x, frag_zp[j].x); + scale_and_sub(frag_b1, frag_s[k % 2][j][0].y, frag_zp[j].y); + } else if constexpr (has_zp && is_zp_float && group_blocks != -1) { + if (is_new_zp) + frag_zpf[k2][j] = __hmul2( + frag_zpf[k2][j], *reinterpret_cast(&frag_s[k2][j])); + scale_and_sub(frag_b0, frag_s[k2][j].x, frag_zpf[k2][j].x); + scale_and_sub(frag_b1, frag_s[k2][j].y, frag_zpf[k2][j].y); + } else if constexpr (group_blocks != -1) { + scale(frag_b0, frag_s[k2][j], 0); + scale(frag_b1, frag_s[k2][j], 1); + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + if 
constexpr (m_block_size_8) { + mma_trans(frag_a[k2][i], frag_b0, frag_b1, frag_c[i][j][0]); + } else { + mma(frag_a[k2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k2][i], frag_b1, frag_c[i][j][1]); + } + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride_threads / 2; + if (red_off >= 1) { + int red_idx = threadIdx.x / b_sh_stride_threads; + constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2; + constexpr int red_sh_delta = b_sh_stride_threads; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + + #pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { + #pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { + #pragma unroll + for (int j = 0; j < 4 * 2; j += (m_block_size_8 ? 2 : 1)) { + int red_sh_wr = + red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float* c_rd = reinterpret_cast( + &sh_red[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh_red[red_sh_wr]); + #pragma unroll + for (int k = 0; k < 4; k++) + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + c_rd[k] + c_wr[k]; + } + sh_red[red_sh_wr] = + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { + #pragma unroll + for (int i = 0; i < 4 * 2; i += (m_block_size_8 ? 
2 : 1)) { + float* c_rd = + reinterpret_cast(&sh_red[red_sh_delta * i + red_sh_rd]); + #pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. + auto global_reduce_fp16 = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. To do this, we write out + // results in FP16 (but still reduce with FP32 compute). + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + if (!is_th_active) { + return; + } + + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr; + if constexpr (m_block_size_8) { + c_gl_wr = c_gl_stride * ((threadIdx.x % 4) * 2) + 4 * (threadIdx.x / 32) + + (threadIdx.x % 32) / 8; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + } else { + c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + } + constexpr int c_sh_wr_delta = active_threads; + int c_sh_wr = threadIdx.x; + + if (!first) { + + #pragma unroll + for (int i = 0; i < (m_block_size_8 ? 
2 : thread_m_blocks * 4); i++) { + int c_idx; + if constexpr (m_block_size_8) + c_idx = c_gl_wr + i * c_gl_stride + + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i; + else + c_idx = + c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); + if (c_idx / c_gl_stride < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[c_idx / c_gl_stride]; + int64_t true_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; + sh_red[c_sh_wr + c_sh_wr_delta * i] = C[true_idx]; + } + } + } + + #pragma unroll + for (int i = 0; i < (m_block_size_8 ? 2 : thread_m_blocks * 4); i++) { + if (!first) { + int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta]; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + int delta = 0; + if constexpr (m_block_size_8) { + delta = j % 2 == 1 ? -2 : 0; + } + reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta] += + Dtype::num2float(reinterpret_cast(&c_red)[j]); + } + } + if (!last) { + int4 c; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + int delta = 0; + if constexpr (m_block_size_8) { + delta = j % 2 == 1 ? -2 : 0; + } + reinterpret_cast(&c)[j] = + Dtype::float2num(reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta]); + } + + int c_idx; + if constexpr (m_block_size_8) + c_idx = c_gl_wr + i * c_gl_stride + + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i; + else + c_idx = + c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); + if (c_idx / c_gl_stride < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[c_idx / c_gl_stride]; + int64_t true_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; + C[true_idx] = c; + } + } + } + }; + + // Globally reduce over threadblocks that compute the same column block. + // We use a tmp C buffer to reduce in full fp32 precision. 
+ auto global_reduce_fp32 = [&](bool first = false, bool last = false) { + constexpr int tb_m = thread_m_blocks * 16; + constexpr int tb_n = thread_n_blocks * 16; + + constexpr int c_size = tb_m * tb_n * sizeof(float) / 16; + + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + + constexpr int num_floats = thread_m_blocks * 4 * 2 * 4; + constexpr int th_size = num_floats * sizeof(float) / 16; + + int c_cur_offset = locks_off * c_size; + + if (!is_th_active) { + return; + } + + if (!first) { + float* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + if constexpr (m_block_size_8) { + if (k % 2) continue; + } else { + if (k / 8 * 16 + (threadIdx.x % 32) / 4 >= block_num_valid_tokens) + continue; + } + + sh_red[threadIdx.x] = + C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; + + float* sh_c_ptr = reinterpret_cast(&sh_red[threadIdx.x]); + #pragma unroll + for (int f = 0; f < 4; f++) { + frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; + } + } + } + + if (!last) { + int4* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + if constexpr (m_block_size_8) { + if (k % 2) continue; + } else { + if (k / 8 * 16 + (threadIdx.x % 32) / 4 >= block_num_valid_tokens) + continue; + } + + C_tmp[c_cur_offset + active_threads * k + threadIdx.x] = frag_c_ptr[k]; + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. 
+ auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = + c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr; + if constexpr (m_block_size_8) { + c_sh_wr = (8 * c_sh_stride) * ((threadIdx.x % 32) % 4 * 2) + + (threadIdx.x % 32) / 4; + c_sh_wr += 64 * (threadIdx.x / 32); + } else { + c_sh_wr = + (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + } + + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS& s) { + scalar_t2 res = + Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); + + // For per-column quantization we finally apply the scale here (only for + // 4-bit) + if constexpr (!has_act_order && group_blocks == -1 && + w_type.size_bits() == 4 && !has_zp) { + res = __hmul2(res, s[0]); + } + + if constexpr (m_block_size_8) { + ((scalar_t*)sh_red)[idx] = res.x; + ((scalar_t*)sh_red)[idx + 8 * c_sh_stride] = res.y; + } else { + ((scalar_t2*)sh_red)[idx] = res; + } + }; + + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + if constexpr (m_block_size_8) { + int wr = c_sh_wr + 16 * j; + write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1], + frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3], + frag_s[j / 2][2 * (j % 2) + 1]); + } else { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, 
frag_c[i][j][0][0], + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + + #pragma unroll + for (int i = 0; + i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); + i++) { + int row = c_gl_wr / c_gl_stride; + if (row < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[row]; + int64_t true_idx = sorted_row * c_gl_stride + c_gl_wr % c_gl_stride; + scalar_t2 topk_weight_score; + if (mul_topk_weights) topk_weight_score = sh_block_topk_weights[row]; + if (use_atomic_add && slice_count > 1 || mul_topk_weights) { + scalar_t2* C_half2 = reinterpret_cast(&C[true_idx]); + scalar_t2* sh_red_half2 = + reinterpret_cast(&sh_red[c_sh_rd]); + #pragma unroll + for (int a = 0; a < 4; a++) { + scalar_t2 res = sh_red_half2[a]; + if (mul_topk_weights) { + res = __hmul2(res, topk_weight_score); + } + + if (use_atomic_add && slice_count > 1) { + atomicAdd(&C_half2[a], res); + } else { + C_half2[a] = res; + }; + } + } else { + C[true_idx] = sh_red[c_sh_rd]; + } + c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + __syncthreads(); + }; + + // Start global fetch and register load pipelines. 
+ auto start_pipes = [&]() { + + #pragma unroll + for (int i = 0; i < stages - 1; i++) { + if (has_act_order && i == 0) { + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + fetch_act_order_scales_to_shared(true, g_idx[slice_k_start], + g_idx[last_g_idx]); + } + + if constexpr (has_zp && !is_zp_float && group_blocks == -1) { + if (i == 0) { + fetch_col_zp_to_shared(); + fetch_col_scale_to_shared(); + } + } + fetch_to_shared(i, i, i < slice_iters); + } + + zero_accums(); + wait_for_stage(); + init_same_group(0); + fetch_to_registers(0, 0); + fetch_scales_to_registers(0, 0); + fetch_zp_to_registers(0, 0); + a_gl_rd += a_gl_rd_delta_o * (stages - 1); + slice_k_start_shared_fetch += tb_k * (stages - 1); + }; + if (slice_iters) { + start_pipes(); + } + + // Main loop. + while (slice_iters) { + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines + // have even length meaning that the next iteration will always start at + // index 0. 
+ + #pragma unroll + for (int pipe = 0; pipe < stages;) { + #pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + fetch_to_registers(k + 1, pipe % stages); + fetch_scales_to_registers(k + 1, pipe); + fetch_zp_to_registers(k + 1, pipe); + if (k == b_sh_wr_iters - 2) { + fetch_to_shared((pipe + stages - 1) % stages, pipe, + slice_iters >= stages); + pipe++; + wait_for_stage(); + init_same_group(pipe % stages); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) { + break; + } + } + a_remaining_load_count_in_slice = 0; + + a_gl_rd += a_gl_rd_delta_o * stages; + slice_k_start += tb_k * stages; + slice_k_start_shared_fetch += tb_k * stages; + + if constexpr (has_act_order) { + int first_group_id = g_idx[slice_k_start]; + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + int last_group_id = g_idx[last_g_idx]; + if (last_group_id >= sh_first_group_id + sh_num_groups) { + fetch_act_order_scales_to_shared(false, first_group_id, last_group_id); + __syncthreads(); + } + } + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. 
+ if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if constexpr (!has_act_order && group_blocks == -1 && !has_zp) { + if (w_type.size_bits() == 8 || (last || use_atomic_add)) { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + cp_async_fence(); + } + } + + thread_block_reduce(); + if constexpr (!has_act_order && group_blocks == -1 && !has_zp) { + if (w_type.size_bits() == 8 || (last || use_atomic_add)) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + if constexpr (m_block_size_8) { + int idx = (threadIdx.x / 4) % 2; + scalar_t2* frag_s_half2 = reinterpret_cast(frag_s); + #pragma unroll + for (int i = 0; i < 8; i++) { + frag_s_half2[i] = Dtype::num2num2( + reinterpret_cast(&frag_s_half2[i])[idx]); + } + } + } + } + } + + // For 8-bit channelwise, we apply the scale before the global reduction + // that converts the fp32 results to fp16 (so that we avoid possible + // overflow in fp16) + if constexpr (!has_act_order && group_blocks == -1 && + w_type.size_bits() == 8 && !has_zp) { + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + scale_float( + reinterpret_cast(&frag_c[i][j][0][0]), + frag_s[j / 2][2 * (j % 2) + 0]); + scale_float( + reinterpret_cast(&frag_c[i][j][0][2]), + frag_s[j / 2][2 * (j % 2) + (m_block_size_8 ? 
1 : 0)]); + + if constexpr (!m_block_size_8) { + scale_float( + reinterpret_cast(&frag_c[i][j][1][0]), + frag_s[j / 2][2 * (j % 2) + 1]); + scale_float( + reinterpret_cast(&frag_c[i][j][1][2]), + frag_s[j / 2][2 * (j % 2) + 1]); + } + } + } + } + } + + if (slice_count > 1 && !use_atomic_add) { + // only globally reduce if there is more than one block in a slice + barrier_acquire(&locks[locks_off], slice_idx); + if (use_fp32_reduce) { + global_reduce_fp32(slice_idx == 0, last); + } else { + global_reduce_fp16(slice_idx == 0, last); + } + barrier_release(&locks[locks_off], last); + } + if (use_atomic_add && slice_count > 1 && slice_idx != 0) + wait_negative_and_add(&locks[locks_off]); + if (last || use_atomic_add) + // only the last block in a slice actually writes the result + write_result(); + if (slice_row) a_remaining_load_count_in_slice = stages; + slice_row = 0; + slice_col_par++; + slice_col++; + is_first_matmul_in_slice = true; + init_slice(); + if (slice_iters) { + a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; + } + + // Update slice k/n for scales loading + if constexpr (has_act_order) { + slice_k_start = tb_k * slice_row; + slice_k_finish = slice_k_start + tb_k * slice_iters; + slice_k_start_shared_fetch = slice_k_start; + slice_n_offset = act_s_col_tb_stride * slice_col; + + } else { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; + } + + start_pipes(); + } + } + } +} + +} // namespace MARLIN_NAMESPACE_NAME + +#endif diff --git a/csrc/moe/marlin_moe_wna16/ops.cu b/csrc/moe/marlin_moe_wna16/ops.cu new file mode 100644 index 0000000000000..a16e955a325e2 --- /dev/null +++ b/csrc/moe/marlin_moe_wna16/ops.cu @@ -0,0 +1,927 @@ +/* + * 
Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ + +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin_moe_wna16 +#endif + +#include "kernel.h" +#include "core/registration.h" + +#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ + static_assert(std::is_same::value || \ + std::is_same::value, \ + "only float16 and bfloat16 is supported"); + +namespace MARLIN_NAMESPACE_NAME { + +__global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){}; + +using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +template +__global__ void permute_cols_kernel( + int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, + const int32_t* __restrict__ sorted_token_ids_ptr, + const int32_t* __restrict__ expert_ids_ptr, + const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m, + int size_k, int top_k) {}; + +} // namespace marlin + +torch::Tensor moe_wna16_marlin_gemm( + torch::Tensor& a, std::optional const& c_or_none, + torch::Tensor& b_q_weight, torch::Tensor& b_scales, + std::optional const& b_zeros_or_none, + std::optional const& g_idx_or_none, + std::optional const& perm_or_none, torch::Tensor& workspace, + torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids, + torch::Tensor& num_tokens_past_padded, 
torch::Tensor& topk_weights, + int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep, + vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, + bool is_zp_float) { + TORCH_CHECK_NOT_IMPLEMENTED(false, + "marlin_gemm(..) requires CUDA_ARCH >= 8.0"); + return torch::empty({1, 1}); +} + +#else + +// For a given "a" of size [M,K] performs a permutation of the K columns based +// on the given "perm" indices. +template +__global__ void permute_cols_kernel( + int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, + const int32_t* __restrict__ sorted_token_ids_ptr, + const int32_t* __restrict__ expert_ids_ptr, + const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m, + int size_k, int top_k) { + int num_tokens_past_padded = num_tokens_past_padded_ptr[0]; + int num_moe_blocks = div_ceil(num_tokens_past_padded, moe_block_size); + int32_t block_sorted_ids[moe_block_size]; + int block_num_valid_tokens = 0; + int64_t old_expert_id = 0; + int64_t expert_id = 0; + int row_stride = size_k * sizeof(half) / 16; + + auto read_moe_block_data = [&](int block_id) { + block_num_valid_tokens = moe_block_size; + int4* tmp_block_sorted_ids = reinterpret_cast(block_sorted_ids); + for (int i = 0; i < moe_block_size / 4; i++) { + tmp_block_sorted_ids[i] = + ((int4*)sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i]; + } + for (int i = 0; i < moe_block_size; i++) { + if (block_sorted_ids[i] >= size_m * top_k) { + block_num_valid_tokens = i; + break; + }; + } + }; + + auto permute_row = [&](int row) { + int iters = size_k / default_threads; + int rest = size_k % default_threads; + + int in_offset = (row / top_k) * row_stride; + int out_offset = row * row_stride; + + half const* a_row_half = + reinterpret_cast(a_int4_ptr + in_offset); + half* out_half = reinterpret_cast(out_int4_ptr + out_offset); + + int base_k = 
0; + + for (int i = 0; i < iters; i++) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + + base_k += default_threads; + } + + if (rest) { + if (threadIdx.x < rest) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + } + } + }; + + for (int index = blockIdx.x; index < num_moe_blocks; index += gridDim.x) { + old_expert_id = expert_id; + int tmp_expert_id = expert_ids_ptr[index]; + if (tmp_expert_id == -1) continue; + expert_id = tmp_expert_id; + perm_int_ptr += (expert_id - old_expert_id) * size_k; + read_moe_block_data(index); + + for (int i = 0; i < block_num_valid_tokens; i++) + permute_row(block_sorted_ids[i]); + } +} + +typedef struct { + int thread_k; + int thread_n; + int num_threads; +} thread_config_t; + +thread_config_t small_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {128, 128, 256}, + {64, 128, 128}}; + +thread_config_t large_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {64, 256, 256}, + {64, 128, 128}}; + +typedef struct { + int blocks_per_sm; + thread_config_t tb_cfg; +} exec_config_t; + +int get_scales_cache_size(thread_config_t const& th_config, int prob_m, + int prob_n, int prob_k, int num_bits, int group_size, + bool has_act_order, bool is_k_full) { + bool cache_scales_chunk = has_act_order && !is_k_full; + + int tb_n = th_config.thread_n; + int tb_k = th_config.thread_k; + + // Get max scale groups per thread-block + int tb_groups; + if (group_size == -1) { + tb_groups = 1; + } else if (group_size == 0) { + tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size + } else { + tb_groups = div_ceil(tb_k, group_size); + } + + if (cache_scales_chunk) { + int load_groups = + tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K + load_groups = max(load_groups, 32); // We load at least 32 scale groups 
+ return load_groups * tb_n * 2; + + } else { + int tb_scales = tb_groups * tb_n * 2; + + return tb_scales * pipe_stages; + } +} + +int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks, + int prob_m, int prob_n, int prob_k, int num_bits, + int group_size, bool has_act_order, bool is_k_full, + int has_zp, int is_zp_float) { + int pack_factor = 32 / num_bits; + + // Get B size + int tb_k = th_config.thread_k; + int tb_n = th_config.thread_n; + int tb_m = thread_m_blocks * 16; + + // shm size for block_sorted_ids/block_topk_weights + // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32) + int sh_block_meta_size = tb_m * 4 * 2; + int sh_a_size = pipe_stages * (tb_m * tb_k) * 2; + int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; + int sh_s_size = + get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, + group_size, has_act_order, is_k_full); + int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0; + int sh_zp_size = 0; + if (has_zp) { + if (is_zp_float) + sh_zp_size = sh_s_size; + else if (num_bits == 4) + sh_zp_size = sh_s_size / 4; + else if (num_bits == 8) + sh_zp_size = sh_s_size / 2; + } + + int total_size = sh_a_size + sh_b_size + sh_s_size + sh_zp_size + + sh_g_idx_size + sh_block_meta_size; + + return total_size; +} + +bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks, + int prob_m, int prob_n, int prob_k, int num_bits, + int group_size, bool has_act_order, bool is_k_full, + int has_zp, int is_zp_float, int max_shared_mem) { + // Sanity + if (th_config.thread_k == -1 || th_config.thread_n == -1 || + th_config.num_threads == -1) { + return false; + } + + // Verify K/N are divisible by thread K/N + if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { + return false; + } + + // Verify min for thread K/N + if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { + return false; + } + + // num_threads 
must be at least 128 (= 4 warps) + if (th_config.num_threads < 128) { + return false; + } + + // Check that pipeline fits into cache + int cache_size = get_kernel_cache_size( + th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, + has_act_order, is_k_full, has_zp, is_zp_float); + return cache_size <= max_shared_mem; +} + + #define __GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ + M_BLOCK_SIZE_8, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, \ + NUM_THREADS, IS_ZP_FLOAT) \ + else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + m_block_size_8 == M_BLOCK_SIZE_8 && \ + has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ + is_zp_float == IS_ZP_FLOAT) { \ + kernel = Marlin; \ + } + + #define GPTQ_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, true, false, 0, NUM_THREADS, \ + false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, false, 0, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 8, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 8, \ + NUM_THREADS, false) + + #define GPTQ_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 
true, false, 0, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, false, 0, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, false, 0, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 8, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 8, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 8, \ + NUM_THREADS, false) + + #define AWQ_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 2, NUM_THREADS, \ + false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 4, NUM_THREADS, \ + false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 8, NUM_THREADS, \ + false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, 
false, false, true, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 8, \ + NUM_THREADS, false) + + #define AWQ_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 8, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 8, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 8, \ + NUM_THREADS, false) + + // We currently have 4-bit models only with group_blocks == 4 + #define HQQ_GET_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 4, NUM_THREADS, \ + true) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, true) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, true) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, true) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, true) + +template +MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type, + int thread_m_blocks, int 
thread_n_blocks, + int thread_k_blocks, bool m_block_size_8, + bool has_act_order, bool has_zp, + int group_blocks, int num_threads, + bool is_zp_float) { + int num_bits = q_type.size_bits(); + auto kernel = MarlinDefault; + if (false) { + } + GPTQ_GET_IF_M1(vllm::kU4B8, 8, 8, 256) + GPTQ_GET_IF_M1(vllm::kU4B8, 8, 4, 128) + + GPTQ_GET_IF_M234(vllm::kU4B8, 16, 4, 256) + GPTQ_GET_IF_M234(vllm::kU4B8, 8, 4, 128) + + GPTQ_GET_IF_M1(vllm::kU8B128, 8, 8, 256) + GPTQ_GET_IF_M1(vllm::kU8B128, 8, 4, 128) + + GPTQ_GET_IF_M234(vllm::kU8B128, 16, 4, 256) + GPTQ_GET_IF_M234(vllm::kU8B128, 8, 4, 128) + + AWQ_GET_IF_M1(vllm::kU4, 8, 8, 256) + AWQ_GET_IF_M1(vllm::kU4, 8, 4, 128) + + AWQ_GET_IF_M234(vllm::kU4, 16, 4, 256) + AWQ_GET_IF_M234(vllm::kU4, 8, 4, 128) + + return kernel; +} + +template +exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m, + int prob_n, int prob_k, int thread_m_blocks, + bool m_block_size_8, int num_bits, + int group_size, bool has_act_order, + bool is_k_full, bool has_zp, + bool is_zp_float, int max_shared_mem) { + exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}}; + thread_config_t* thread_configs = thread_m_blocks > 1 + ? large_batch_thread_configs + : small_batch_thread_configs; + int thread_configs_size = + thread_m_blocks > 1 + ? 
sizeof(large_batch_thread_configs) / sizeof(thread_config_t) + : sizeof(small_batch_thread_configs) / sizeof(thread_config_t); + + int count = 0; + constexpr int device_max_reg_size = 255 * 1024; + for (int i = 0; i < thread_configs_size; i++) { + thread_config_t th_config = thread_configs[i]; + + if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k, + num_bits, group_size, has_act_order, is_k_full, has_zp, + is_zp_float, max_shared_mem)) { + continue; + } + + int cache_size = get_kernel_cache_size( + th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, + group_size, has_act_order, is_k_full, has_zp, is_zp_float); + + int group_blocks = 0; + if (!has_act_order) { + group_blocks = group_size == -1 ? -1 : group_size / 16; + } + + auto kernel = get_marlin_kernel( + q_type, thread_m_blocks, th_config.thread_n / 16, + th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp, + group_blocks, th_config.num_threads, is_zp_float); + + if (kernel == MarlinDefault) continue; + + if (thread_m_blocks > 1) { + exec_cfg = {1, th_config}; + break; + } else { + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, kernel); + int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4; + int allow_count = min(device_max_reg_size / reg_size, + max_shared_mem / (cache_size + 1024)); + allow_count = max(min(allow_count, 4), 1); + if (allow_count > count) { + count = allow_count; + exec_cfg = {count, th_config}; + }; + } + } + + return exec_cfg; +} + +template +void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, + void* zp, void* g_idx, void* perm, void* a_tmp, + void* sorted_token_ids, void* expert_ids, + void* num_tokens_past_padded, void* topk_weights, + int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep, + int prob_m, int prob_n, int prob_k, void* workspace, + vllm::ScalarType const& q_type, bool has_act_order, + bool is_k_full, bool has_zp, int num_groups, int group_size, + int dev, cudaStream_t stream, 
int thread_k, int thread_n, + int sms, bool use_atomic_add, bool use_fp32_reduce, + bool is_zp_float) { + int thread_m_blocks = div_ceil(moe_block_size, 16); + bool m_block_size_8 = moe_block_size == 8; + + if (has_zp) { + TORCH_CHECK( + q_type == vllm::kU4 || q_type == vllm::kU8, + "q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str()); + } else { + TORCH_CHECK( + q_type == vllm::kU4B8 || q_type == vllm::kU8B128, + "q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ", + q_type.str()); + } + + TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, + ", ", prob_n, ", ", prob_k, "]"); + + int group_blocks = 0; + if (has_act_order) { + if (is_k_full) { + TORCH_CHECK(group_size != -1); + group_blocks = group_size / 16; + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } else { + TORCH_CHECK(group_size == 0); + group_blocks = 0; + } + } else { + if (group_size == -1) { + group_blocks = -1; + } else { + group_blocks = group_size / 16; + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } + } + + int num_bits = q_type.size_bits(); + const int4* A_ptr = (const int4*)A; + const int4* B_ptr = (const int4*)B; + int4* C_ptr = (int4*)C; + int4* C_tmp_ptr = (int4*)C_tmp; + const int4* s_ptr = (const int4*)s; + const int4* zp_ptr = (const int4*)zp; + const int* g_idx_ptr = (const int*)g_idx; + const int* perm_ptr = (const int*)perm; + int4* a_tmp_ptr = (int4*)a_tmp; + const int32_t* sorted_token_ids_ptr = (const int32_t*)sorted_token_ids; + const int32_t* expert_ids_ptr = (const int32_t*)expert_ids; + const int32_t* num_tokens_past_padded_ptr = + (const int32_t*)num_tokens_past_padded; + const float* topk_weights_ptr = (const float*)topk_weights; + int* locks = (int*)workspace; + + if (has_act_order) { + // Permute A columns + auto kernel = permute_cols_kernel<8>; + if (moe_block_size == 8) 
{ + } else if (moe_block_size == 16) + kernel = permute_cols_kernel<16>; + else if (moe_block_size == 32) + kernel = permute_cols_kernel<32>; + else if (moe_block_size == 48) + kernel = permute_cols_kernel<48>; + else if (moe_block_size == 64) + kernel = permute_cols_kernel<64>; + else + TORCH_CHECK(false, "unsupported moe_block_size ", moe_block_size); + + // avoid ">>>" being formatted to "> > >" + // clang-format off + kernel<<>>( + A_ptr, perm_ptr, a_tmp_ptr, sorted_token_ids_ptr, expert_ids_ptr, + num_tokens_past_padded_ptr, prob_m, prob_k, top_k); + // clang-format on + A_ptr = a_tmp_ptr; + prob_m = prob_m * top_k; + top_k = 1; + + // If we have a full K, then we can run the non-act-order version of Marlin + // (since the weight rows are reordered by increasing group ids, and by + // having a full K, we have full original groups) + if (is_k_full) has_act_order = false; + } + + int max_shared_mem = 0; + cudaDeviceGetAttribute(&max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + TORCH_CHECK(max_shared_mem > 0); + + // Set thread config + exec_config_t exec_cfg; + thread_config_t thread_tfg; + if (thread_k != -1 && thread_n != -1) { + thread_tfg = thread_config_t{thread_k, thread_n, default_threads}; + exec_cfg = exec_config_t{1, thread_tfg}; + TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, + " is not divisible by thread_n = ", thread_n); + TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, + " is not divisible by thread_k = ", thread_k); + } else { + // Auto config + exec_cfg = determine_exec_config( + q_type, prob_m, prob_n, prob_k, thread_m_blocks, m_block_size_8, + num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float, + max_shared_mem); + thread_tfg = exec_cfg.tb_cfg; + } + + int num_threads = thread_tfg.num_threads; + thread_k = thread_tfg.thread_k; + thread_n = thread_tfg.thread_n; + int blocks = sms * exec_cfg.blocks_per_sm; + if (exec_cfg.blocks_per_sm > 1) + max_shared_mem = max_shared_mem / 
exec_cfg.blocks_per_sm - 1024; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + + TORCH_CHECK(is_valid_config(thread_tfg, thread_m_blocks, prob_m, prob_n, + prob_k, num_bits, group_size, has_act_order, + is_k_full, has_zp, is_zp_float, max_shared_mem), + "Invalid thread config: thread_m_blocks = ", thread_m_blocks, + ", thread_k = ", thread_tfg.thread_k, + ", thread_n = ", thread_tfg.thread_n, + ", num_threads = ", thread_tfg.num_threads, " for MKN = [", + prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits, + ", group_size = ", group_size, + ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full, + ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float, + ", max_shared_mem = ", max_shared_mem); + + auto kernel = get_marlin_kernel( + q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, m_block_size_8, + has_act_order, has_zp, group_blocks, num_threads, is_zp_float); + + if (kernel == MarlinDefault) { + TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, + ", ", prob_k, "]", ", has_act_order = ", has_act_order, + ", num_groups = ", num_groups, ", group_size = ", group_size, + ", thread_m_blocks = ", thread_m_blocks, + ", thread_n_blocks = ", thread_n_blocks, + ", thread_k_blocks = ", thread_k_blocks, + ", num_bits = ", num_bits); + } + + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem); + // avoid ">>>" being formatted to "> > >" + // clang-format off + kernel<<>>( + A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, + sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr, + topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m, + prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce); + // clang-format on +} + +} // namespace MARLIN_NAMESPACE_NAME + +torch::Tensor moe_wna16_marlin_gemm( + torch::Tensor& a, std::optional const& c_or_none, + torch::Tensor& b_q_weight, torch::Tensor& b_scales, + std::optional 
const& b_zeros_or_none, + std::optional const& g_idx_or_none, + std::optional const& perm_or_none, torch::Tensor& workspace, + torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids, + torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights, + int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep, + vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, + bool is_zp_float) { + vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); + int pack_factor = 32 / b_q_type.size_bits(); + + if (moe_block_size != 8) { + TORCH_CHECK(moe_block_size % 16 == 0, + "unsupported moe_block_size=", moe_block_size); + TORCH_CHECK(moe_block_size >= 16 && moe_block_size <= 64, + "unsupported moe_block_size=", moe_block_size); + } + + // Verify A + TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0), + ", size_m = ", size_m); + TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1), + ", size_k = ", size_k); + + // Verify B + TORCH_CHECK( + size_k % MARLIN_NAMESPACE_NAME::tile_size == 0, "size_k = ", size_k, + " is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size); + TORCH_CHECK((size_k / MARLIN_NAMESPACE_NAME::tile_size) == b_q_weight.size(1), + "Shape mismatch: b_q_weight.size(1) = ", b_q_weight.size(1), + ", size_k = ", size_k, + ", tile_size = ", MARLIN_NAMESPACE_NAME::tile_size); + TORCH_CHECK( + b_q_weight.size(2) % MARLIN_NAMESPACE_NAME::tile_size == 0, + "b_q_weight.size(2) = ", b_q_weight.size(2), + " is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size); + int actual_size_n = + (b_q_weight.size(2) / MARLIN_NAMESPACE_NAME::tile_size) * pack_factor; + TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n, + ", actual_size_n = ", actual_size_n); + + // Verify device and strides + TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); + 
TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + + TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); + TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); + + TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); + TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_k = -1; + // thread_n: `n` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_n = -1; + // sms: number of SMs to use for the kernel + int sms = -1; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device()); + + // Alloc buffers + const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); + auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + torch::Tensor c; + if (c_or_none.has_value()) { + c = c_or_none.value(); + TORCH_CHECK(c.device().is_cuda(), "c is not on GPU"); + TORCH_CHECK(c.is_contiguous(), "c is not contiguous"); + TORCH_CHECK(c.size(0) == size_m * top_k, + "Shape mismatch: c.size(0) = ", c.size(0), + ", size_m * topk = ", size_m * top_k); + TORCH_CHECK(c.size(1) == size_n, "Shape mismatch: c.size(1) = ", c.size(1), + ", size_n = ", size_n); + } else { + c = torch::empty({size_m * top_k, size_n}, options); + } + + // Alloc C tmp buffer that is going to be used for the global reduce + torch::Tensor c_tmp; + auto options_fp32 = + torch::TensorOptions().dtype(at::kFloat).device(a.device()); + if (use_fp32_reduce && !use_atomic_add) { + // max num of threadblocks is sms * 4 + long max_c_tmp_size = min( + (long)size_n * sorted_token_ids.size(0), + (long)sms * 4 * moe_block_size * MARLIN_NAMESPACE_NAME::max_thread_n); + if (moe_block_size == 8) max_c_tmp_size *= 2; + c_tmp = torch::empty({max_c_tmp_size}, options_fp32); + } else { + c_tmp = torch::empty({0}, options_fp32); + } + + // Detect groupsize and act_order + int num_groups 
= -1; + int group_size = -1; + + int rank = b_scales.sizes().size(); + TORCH_CHECK(rank == 3, "b_scales rank = ", rank, " is not 3"); + TORCH_CHECK(b_scales.size(2) == size_n, "b_scales dim 2 = ", b_scales.size(2), + " is not size_n = ", size_n); + num_groups = b_scales.size(1); + + torch::Tensor g_idx, perm, a_tmp; + ; + if (g_idx_or_none.has_value() && perm_or_none.has_value()) { + g_idx = g_idx_or_none.value(); + perm = perm_or_none.value(); + + TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU"); + TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous"); + TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU"); + TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous"); + + // Verify g_idx and perm + TORCH_CHECK((g_idx.size(-1) == 0 && perm.size(-1) == 0) || + (g_idx.size(-1) == size_k && perm.size(-1) == size_k), + "Unexpected g_idx.size(-1) = ", g_idx.size(-1), + " and perm.size(-1) = ", perm.size(-1), + ", where size_k = ", size_k); + } else { + g_idx = torch::empty({0}, options); + perm = torch::empty({0}, options); + a_tmp = torch::empty({0}, options); + } + bool has_act_order = g_idx.size(-1) > 0 && perm.size(-1) > 0; + + if (has_act_order) { + a_tmp = torch::empty({size_m * top_k, size_k}, options); + if (is_k_full) { + TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1"); + TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k, + ", is not divisible by num_groups = ", num_groups); + group_size = size_k / num_groups; + } else { + group_size = 0; + } + + } else { + a_tmp = torch::empty({0}, options); + if (num_groups > 1) { + TORCH_CHECK( + size_k % num_groups == 0, "size_k = ", size_k, + ", is not divisible by b_scales.size(1) = ", b_scales.size(1)); + group_size = size_k / num_groups; + } else { + group_size = -1; + } + } + + torch::Tensor b_zeros; + if (b_zeros_or_none.has_value()) { + b_zeros = b_zeros_or_none.value(); + TORCH_CHECK(b_zeros.device().is_cuda(), "b_zeros is not on GPU"); + 
TORCH_CHECK(b_zeros.is_contiguous(), "b_zeros is not contiguous"); + } else { + b_zeros = torch::empty({0}, options); + } + bool has_zp = b_zeros.size(-1) > 0; + + if (has_zp) { + TORCH_CHECK( + b_q_type == vllm::kU4, + "b_q_type must be u4 when has_zp = True. Got = ", b_q_type.str()); + } else { + TORCH_CHECK( + b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128, + "b_q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ", + b_q_type.str()); + } + + if (has_zp && is_zp_float) { + TORCH_CHECK(a.scalar_type() == at::ScalarType::Half, + "Computation type must be float16 (half) when using float zero " + "points."); + } + + // Verify b_zeros + if (has_zp) { + int rank = b_zeros.sizes().size(); + TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3"); + if (is_zp_float) { + TORCH_CHECK(b_zeros.size(2) == size_n, + "b_zeros dim 2 = ", b_zeros.size(2), + " is not size_n = ", size_n); + TORCH_CHECK(num_groups == b_zeros.size(1), + "b_zeros dim 1 = ", b_zeros.size(1), + " is not num_groups = ", num_groups); + TORCH_CHECK(num_groups != -1, "num_groups must be != -1"); + } else { + TORCH_CHECK(b_zeros.size(1) == num_groups, + "b_zeros dim 1 = ", b_zeros.size(1), + " is not num_groups = ", num_groups); + TORCH_CHECK(b_zeros.size(2) == size_n / pack_factor, + "b_zeros dim 2 = ", b_zeros.size(2), + " is not size_n / pack_factor = ", size_n / pack_factor); + } + } + + // Verify workspace size + TORCH_CHECK(size_n % MARLIN_NAMESPACE_NAME::min_thread_n == 0, + "size_n = ", size_n, ", is not divisible by min_thread_n = ", + MARLIN_NAMESPACE_NAME::min_thread_n); + + int max_n_tiles = size_n / MARLIN_NAMESPACE_NAME::min_thread_n; + int min_workspace_size = min( + max_n_tiles * (int)(sorted_token_ids.size(0) / moe_block_size), sms * 4); + TORCH_CHECK(workspace.numel() >= min_workspace_size, + "workspace.numel = ", workspace.numel(), + " is below min_workspace_size = ", min_workspace_size); + + int dev = a.get_device(); + if (a.scalar_type() == at::ScalarType::Half) { 
+ MARLIN_NAMESPACE_NAME::marlin_mm( + a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), + c_tmp.data_ptr(), b_scales.data_ptr(), + b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), + a_tmp.data_ptr(), sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(), + topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep, + size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order, + is_k_full, has_zp, num_groups, group_size, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, + use_atomic_add, use_fp32_reduce, is_zp_float); + } else if (a.scalar_type() == at::ScalarType::BFloat16) { + MARLIN_NAMESPACE_NAME::marlin_mm( + a.data_ptr(), b_q_weight.data_ptr(), + c.data_ptr(), c_tmp.data_ptr(), + b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), + perm.data_ptr(), a_tmp.data_ptr(), + sorted_token_ids.data_ptr(), expert_ids.data_ptr(), + num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(), + moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k, + workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, + num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), + thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float); + } else { + TORCH_CHECK(false, + "moe_wna16_marlin_gemm only supports bfloat16 and float16"); + } + + return c; +} + +#endif + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm); +} diff --git a/csrc/moe/moe_wna16.cu b/csrc/moe/moe_wna16.cu index 51ae76c1ec882..7b6a111c00adc 100644 --- a/csrc/moe/moe_wna16.cu +++ b/csrc/moe/moe_wna16.cu @@ -13,7 +13,6 @@ template __global__ void moe_wna16_gemm_kernel( const scalar_t* __restrict__ input, scalar_t* __restrict__ output, - const uint32_t* __restrict__ qweight, const scalar_t* __restrict__ scales, const uint32_t* __restrict__ qzeros, @@ -54,8 +53,6 @@ __global__ void moe_wna16_gemm_kernel( if 
(token_index / top_k >= size_m) break; num_valid_tokens = m + 1; - if (blockIdx.z == 0 && offset_n < size_n) - output[token_index * size_n + offset_n] = Dtype::int2num(0); if (expert_id != -1) { int k_per_thread = DIVIDE(BLOCK_SIZE_K, BLOCK_SIZE_N); @@ -284,8 +281,7 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_K, int64_t bit) { const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - auto options = - torch::TensorOptions().dtype(input.dtype()).device(input.device()); + output.zero_(); const int num_experts = b_qweight.size(0); const int size_m = input.size(0); @@ -302,9 +298,9 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, const uint32_t* b_qzeros_ptr; if (b_qzeros.has_value()) b_qzeros_ptr = (const uint32_t*)b_qzeros.value().data_ptr(); - const float* topk_weights_ptr; + const float* topk_weights_ptr = nullptr; if (topk_weights.has_value()) - topk_weights_ptr = (const float*)topk_weights.value().data_ptr(); + topk_weights_ptr = (const float*)topk_weights.value().data_ptr(); int groups_per_block_row = BLOCK_SIZE_K / group_size; TORCH_CHECK(bit == 4 || bit == 8, "bit must be 4 or 8"); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 718418e6cd497..d0de42251f97a 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -43,14 +43,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm); m.def( - "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " - "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " - "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, " - "int b_q_type, SymInt size_m, " - "SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int " - "topk, " - "int moe_block_size, bool replicate_input, bool apply_weights)" - " -> Tensor"); + "moe_wna16_marlin_gemm(Tensor! a, Tensor? 
c_or_none," + "Tensor! b_q_weight, Tensor! b_scales, Tensor? b_zeros_or_none," + "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace," + "Tensor sorted_token_ids," + "Tensor! expert_ids, Tensor! num_tokens_past_padded," + "Tensor! topk_weights, int moe_block_size, int top_k, " + "bool mul_topk_weights, bool is_ep, int b_q_type_id," + "int size_m, int size_n, int size_k," + "bool is_full_k, bool use_atomic_add," + "bool use_fp32_reduce, bool is_zp_float) -> Tensor"); + // conditionally compiled so impl registration is in source file #endif diff --git a/csrc/ops.h b/csrc/ops.h index 7434aead57f0e..86039a26041ba 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -52,6 +52,15 @@ void paged_attention_v2( const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); +#ifndef USE_ROCM +void merge_attn_states(torch::Tensor& output, + std::optional output_lse, + const torch::Tensor& prefix_output, + const torch::Tensor& prefix_lse, + const torch::Tensor& suffix_output, + const torch::Tensor& suffix_lse); +#endif + void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, double epsilon); @@ -119,6 +128,8 @@ void advance_step_flashinfer( torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds); +torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor); + #ifndef USE_ROCM torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, @@ -143,7 +154,8 @@ torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm); #endif torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, - int64_t n); + int64_t n, + std::optional const& dtype); torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row); @@ -164,6 +176,7 @@ int64_t ggml_moe_get_block_size(int64_t type); bool 
cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability); bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability); +bool cutlass_group_gemm_supported(int64_t cuda_device_capability); void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A, torch::Tensor const& B, torch::Tensor const& A_sf, @@ -175,6 +188,19 @@ void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b_scales, std::optional const& bias); +void cutlass_moe_mm( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides); + +void get_cutlass_moe_mm_data( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k); + void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -251,10 +277,10 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, const std::optional& has_initial_state, bool silu_activation, int64_t pad_slot_id); -#ifndef USE_ROCM using fptr_t = int64_t; fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, - torch::Tensor& rank_data, int64_t rank, bool full_nvlink); + torch::Tensor& rank_data, int64_t rank, + bool fully_connected); void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, fptr_t reg_buffer, int64_t reg_buffer_sz_bytes); void dispose(fptr_t _fa); @@ -265,4 +291,7 @@ get_graph_buffer_ipc_meta(fptr_t _fa); void register_graph_buffers(fptr_t _fa, const std::vector>& handles, 
const std::vector>& offsets); -#endif +std::tuple allocate_shared_buffer_and_handle( + int64_t size); +int64_t open_mem_handle(torch::Tensor& mem_handle); +void free_shared_buffer(int64_t buffer); diff --git a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh new file mode 100644 index 0000000000000..6c6e89790847f --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh @@ -0,0 +1,80 @@ +#pragma once + +#include +#include +#include + +#include "core/scalar_type.hpp" +#include "cutlass/bfloat16.h" +#include "cutlass/float8.h" + +template +__global__ void get_group_gemm_starts( + int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, + ElementC** out_offsets, ElementAccumulator** a_scales_offsets, + ElementAccumulator** b_scales_offsets, ElementAB* a_base_as_int, + ElementAB* b_base_as_int, ElementC* out_base_as_int, + ElementAccumulator* a_scales_base_as_int, + ElementAccumulator* b_scales_base_as_int, int64_t n, int64_t k, + bool per_act_token, bool per_out_ch) { + int expert_id = threadIdx.x; + + int64_t expert_offset = expert_offsets[expert_id]; + + a_offsets[expert_id] = a_base_as_int + expert_offset * k; + b_offsets[expert_id] = b_base_as_int + expert_id * k * n; + out_offsets[expert_id] = out_base_as_int + expert_offset * n; + a_scales_offsets[expert_id] = + a_scales_base_as_int + (per_act_token ? expert_offset : 0); + b_scales_offsets[expert_id] = + b_scales_base_as_int + (per_out_ch ? 
n * expert_id : expert_id); +} + +#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE) \ + else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ + get_group_gemm_starts \ + <<<1, num_experts, 0, stream>>>( \ + static_cast(expert_offsets.data_ptr()), \ + static_cast(a_ptrs.data_ptr()), \ + static_cast(b_ptrs.data_ptr()), \ + static_cast(out_ptrs.data_ptr()), \ + static_cast(a_scales_ptrs.data_ptr()), \ + static_cast(b_scales_ptrs.data_ptr()), \ + static_cast(a_tensors.data_ptr()), \ + static_cast(b_tensors.data_ptr()), \ + static_cast(out_tensors.data_ptr()), \ + static_cast(a_scales.data_ptr()), \ + static_cast(b_scales.data_ptr()), out_tensors.size(1), \ + a_tensors.size(1), per_act_token, per_out_ch); \ + } + +namespace { + +void run_get_group_gemm_starts( + torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs, + torch::Tensor& b_ptrs, torch::Tensor& out_ptrs, + torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs, + torch::Tensor const& a_tensors, torch::Tensor const& b_tensors, + torch::Tensor& out_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(a_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + + int num_experts = static_cast(expert_offsets.size(0)); + bool per_act_token = a_scales.numel() != 1; + bool per_out_ch = b_scales.numel() != num_experts; + + auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); + + if (false) { + } + __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t) + __CALL_GET_STARTS_KERNEL(torch::kFloat16, half) + else { + TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)"); + } +} + +} // namespace \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu new file mode 100644 index 
0000000000000..2b8bc3fb0b261 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu @@ -0,0 +1,160 @@ +#include + +#include +#include + +#include "cutlass/cutlass.h" +#include "grouped_mm_c3x.cuh" + +using namespace cute; + +namespace { + +template typename Epilogue> +struct sm90_fp8_config_default { + // M in (16, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_M16 { + // M in [1, 16] + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_K8192 { + // K in [8192, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_N8192 { + // N in [8192, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template +void run_cutlass_moe_mm_sm90( + torch::Tensor& 
out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided."); + TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided."); + TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided."); + + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn, + "A tensors must be of type float8_e4m3fn."); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn, + "B tensors must be of type float8_e4m3fn."); + + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmN8192 = typename sm90_fp8_config_N8192< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmM16 = typename sm90_fp8_config_M16< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmDefault = typename sm90_fp8_config_default< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + + uint32_t const m = a_tensors.size(0); + uint32_t const n = out_tensors.size(1); + uint32_t const k = a_tensors.size(1); + + if (n >= 8192) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } else if (k >= 8192) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } else if (m <= 16) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, 
a_strides, b_strides, c_strides); + } else { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } +} + +void dispatch_moe_mm_sm90( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + if (out_tensors.dtype() == torch::kBFloat16) { + run_cutlass_moe_mm_sm90( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } else { + run_cutlass_moe_mm_sm90( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } +} + +} // namespace + +void cutlass_moe_mm_sm90( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + expert_offsets, problem_sizes, a_strides, b_strides, + c_strides); +} diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh new file mode 100644 index 0000000000000..db827b7c5e186 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh @@ -0,0 +1,149 @@ +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include 
"cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" +#include "cutlass_extensions/common.hpp" +#include "get_group_starts.cuh" + +using namespace cute; + +namespace { + +using ProblemShape = + cutlass::gemm::GroupProblemShape>; + +using ElementAccumulator = float; +using ArchTag = cutlass::arch::Sm90; +using OperatorClass = cutlass::arch::OpClassTensorOp; + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using LayoutC = cutlass::layout::RowMajor; + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_group_gemm { + using ElementAB = ElementAB_; + using ElementC = void; + using ElementD = ElementC_; + using ElementAccumulator = float; + + using Epilogue = Epilogue_; + + using StrideC = + cute::remove_pointer_t, cute::Int<0>>>; + + static constexpr int AlignmentAB = + 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; + + using EVTCompute = typename Epilogue::EVTCompute; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, + ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD, + LayoutC*, AlignmentC, EpilogueSchedule, EVTCompute>::CollectiveOp; + + static constexpr size_t CEStorageSize = + sizeof(typename CollectiveEpilogue::SharedStorage); + using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(CEStorageSize)>; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementAB, LayoutA*, AlignmentAB, ElementAB, + LayoutB*, AlignmentAB, ElementAccumulator, TileShape, ClusterShape, + Stages, KernelSchedule>::CollectiveOp; + + using KernelType = enable_sm90_only>; + + struct GemmKernel : public KernelType {}; +}; + 
+template +void cutlass_group_gemm_caller( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int num_experts = static_cast(expert_offsets.size(0)); + int k_size = a_tensors.size(1); + int n_size = out_tensors.size(1); + + bool per_act_token = a_scales.numel() != 1; + bool per_out_ch = b_scales.numel() != num_experts; + + auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); + + auto options_int = + torch::TensorOptions().dtype(torch::kInt64).device(a_tensors.device()); + + torch::Tensor a_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_ptrs = torch::empty(num_experts, options_int); + torch::Tensor out_ptrs = torch::empty(num_experts, options_int); + torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int); + + run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs, + a_scales_ptrs, b_scales_ptrs, a_tensors, b_tensors, + out_tensors, a_scales, b_scales); + + using GemmKernel = typename Gemm::GemmKernel; + using StrideA = Stride, Int<0>>; + using StrideB = Stride, Int<0>>; + using StrideC = typename GemmKernel::InternalStrideC; + + ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes = + static_cast( + problem_sizes.data_ptr()); + ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr}; + + typename GemmKernel::MainloopArguments mainloop_args{ + static_cast(a_ptrs.data_ptr()), + static_cast(a_strides.data_ptr()), + static_cast(b_ptrs.data_ptr()), + static_cast(b_strides.data_ptr())}; + + // Currently, we are only able to do broadcast on 
either all or none a_scales + // and on either all or none b_scales + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + static_cast(a_scales_ptrs.data_ptr()), + static_cast(b_scales_ptrs.data_ptr()), + per_act_token, per_out_ch), + nullptr, static_cast(c_strides.data_ptr()), + static_cast(out_ptrs.data_ptr()), + static_cast(c_strides.data_ptr())}; + + typename GemmKernel::Arguments args{ + cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape, mainloop_args, + epilogue_args}; + + using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a_tensors.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +} // namespace diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu new file mode 100644 index 0000000000000..894727383a639 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -0,0 +1,103 @@ +#include + +#include +#include + +#include + +constexpr uint64_t THREADS_PER_EXPERT = 512; + +__global__ void compute_problem_sizes(const int* __restrict__ topk_ids, + int32_t* problem_sizes1, + int32_t* problem_sizes2, + int32_t* atomic_buffer, + const int topk_length, const int n, + const int k) { + int expert_id = blockIdx.x; + + int occurrences = 0; + for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) { + occurrences += (topk_ids[i] == expert_id); + } + atomicAdd(&atomic_buffer[expert_id], occurrences); + __syncthreads(); + + if (threadIdx.x == 0) { + int final_occurrences = atomic_buffer[expert_id]; + problem_sizes1[expert_id * 3] = final_occurrences; + problem_sizes1[expert_id * 3 + 1] = 2 * n; + 
problem_sizes1[expert_id * 3 + 2] = k; + problem_sizes2[expert_id * 3] = final_occurrences; + problem_sizes2[expert_id * 3 + 1] = k; + problem_sizes2[expert_id * 3 + 2] = n; + } +} + +__global__ void compute_expert_offsets( + const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets, + int32_t* atomic_buffer, const int num_experts) { + int32_t tot_offset = 0; + expert_offsets[0] = 0; + for (int i = 0; i < num_experts; ++i) { + atomic_buffer[i] = tot_offset; + tot_offset += problem_sizes1[i * 3]; + expert_offsets[i + 1] = tot_offset; + } +} + +__global__ void compute_arg_sorts(const int* __restrict__ topk_ids, + const int32_t* __restrict__ expert_offsets, + int32_t* input_permutation, + int32_t* output_permutation, + int32_t* atomic_buffer, const int topk_length, + const int topk) { + int const blk_expert_id = blockIdx.x; + int const num_experts = gridDim.x; + int32_t const num_tokens = expert_offsets[num_experts]; + + for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) { + int const expert_id = topk_ids[i]; + if (expert_id == -1 && blockIdx.x == 0) { + // output_permutation is used to re-order the moe outputs. It is + // used as c2 = c2[c_map], where c2 is a torch.tensor that is the + // output of the cutlass kernels and c_map is the output_permutation. + // c2 is initialized to zeros, therefore by setting the output_permutation + // to num_tokens, we are guaranteed to fill the moe outputs to zero + // for "invalid" topk_ids. 
+ output_permutation[i] = num_tokens; + } else if (expert_id == blk_expert_id) { + int start = atomicAdd(&atomic_buffer[expert_id], 1); + input_permutation[start] = i / topk; + output_permutation[i] = start; + } + } +} + +void get_cutlass_moe_mm_data_caller( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k) { + auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index()); + auto options_int32 = + torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device()); + torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32); + + int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel()); + compute_problem_sizes<<>>( + static_cast(topk_ids.data_ptr()), + static_cast(problem_sizes1.data_ptr()), + static_cast(problem_sizes2.data_ptr()), + static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, k); + compute_expert_offsets<<<1, 1, 0, stream>>>( + static_cast(problem_sizes1.data_ptr()), + static_cast(expert_offsets.data_ptr()), + static_cast(atomic_buffer.data_ptr()), num_experts); + compute_arg_sorts<<>>( + static_cast(topk_ids.data_ptr()), + static_cast(expert_offsets.data_ptr()), + static_cast(input_permutation.data_ptr()), + static_cast(output_permutation.data_ptr()), + static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), + topk_ids.size(1)); +} diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index b08386459cbe2..54b63894e4cbc 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -29,6 +29,20 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, std::optional const& bias); + +void cutlass_moe_mm_sm90( + 
torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides); + +void get_cutlass_moe_mm_data_caller( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k); + #endif #if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100 @@ -102,6 +116,19 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) { return false; } +bool cutlass_group_gemm_supported(int64_t cuda_device_capability) { + // CUTLASS groped FP8 kernels need at least CUDA 12.3 + // and SM90 (Hopper) + +#if defined CUDA_VERSION + if (cuda_device_capability == 90) { + return CUDA_VERSION >= 12030; + } +#endif + + return false; +} + void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, @@ -168,6 +195,46 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, version_num); } +void cutlass_moe_mm( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + int32_t version_num = get_sm_version_num(); +#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 + cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + expert_offsets, problem_sizes, a_strides, b_strides, + c_strides); + return; +#endif + 
TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled cutlass_scaled_mm for CUDA device capability: ", version_num, + ". Required capability: 90"); +} + +void get_cutlass_moe_mm_data( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k) { + // This function currently gets compiled only if we have a valid cutlass moe + // mm to run it for. + int32_t version_num = get_sm_version_num(); +#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 + get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1, + problem_sizes2, input_permutation, + output_permutation, num_experts, n, k); + return; +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for " + "CUDA device capability: ", + version_num, ". Required capability: 90"); +} + void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 8f9aa21aae4ee..eceb3a8ea05da 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -30,9 +30,6 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel( fp8_type* __restrict__ out, float* __restrict__ scale, scalar_t const* __restrict__ input, float const* __restrict__ scale_ub, const int hidden_size) { - float const min_scaling_factor = - 1.0f / (fp8_e4m3_adjusted_max_v * 512.f); - int const tid = threadIdx.x; int const token_idx = blockIdx.x; @@ -67,8 +64,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel( token_scale = block_absmax_val_maybe; } // token scale computation - token_scale = max(token_scale / fp8_e4m3_adjusted_max_v, - min_scaling_factor); + token_scale = max(token_scale / quant_type_max_v, + 
min_scaling_factor::val()); scale[token_idx] = token_scale; } __syncthreads(); diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh index d331c63ae827f..def8b31b27546 100644 --- a/csrc/quantization/fp8/common.cuh +++ b/csrc/quantization/fp8/common.cuh @@ -1,20 +1,12 @@ #pragma once #include "quantization/vectorization.cuh" +#include "quantization/utils.cuh" #include -#include -#ifndef USE_ROCM - #include - #define MAYBE_HOST_DEVICE C10_HOST_DEVICE -#else - #include - #include - #include +#ifdef USE_ROCM #include "amd/quant_utils.cuh" - // ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr - #define MAYBE_HOST_DEVICE #endif // Determines the preferred FP8 type for the current platform. @@ -31,29 +23,6 @@ static bool is_fp8_ocp() { #endif } -template -struct fp8_e4m3_adjusted_max; - -template <> -struct fp8_e4m3_adjusted_max { - static constexpr c10::Float8_e4m3fn val() { - return std::numeric_limits::max(); - } -}; - -// Using the default max value from pytorch (240.0 0x7F) will cause accuracy -// issues when running dynamic quantization. Here use 224.0 0x7E for rocm. 
-template <> -struct fp8_e4m3_adjusted_max { - static constexpr c10::Float8_e4m3fnuz val() { - return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits()); - } -}; - -template -MAYBE_HOST_DEVICE static constexpr T fp8_e4m3_adjusted_max_v = - fp8_e4m3_adjusted_max::val(); - namespace vllm { __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { @@ -76,8 +45,8 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val, x = val / scale; } - float r = fmax(-fp8_e4m3_adjusted_max_v, - fmin(x, fp8_e4m3_adjusted_max_v)); + float r = + fmax(-quant_type_max_v, fmin(x, quant_type_max_v)); #ifndef USE_ROCM return static_cast(r); #else @@ -123,7 +92,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, // Finally, since cache[0] contains the maximum for this thread block, // atomically write the max to the target location if (threadIdx.x == 0) { - atomicMaxFloat(scale, cache[0] / fp8_e4m3_adjusted_max_v); + atomicMaxFloat(scale, cache[0] / quant_type_max_v); } } diff --git a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu index 1be89c504bfeb..2b6ab7fcec902 100644 --- a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu +++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu @@ -14,8 +14,7 @@ __device__ void rms_norm_dynamic_per_token_quant_vec( float* __restrict__ scales, // [num_tokens] scalar_t const* __restrict__ input, // [..., hidden_size] scalar_t const* __restrict__ weight, // [hidden_size] - float const* scale_ub, float const var_epsilon, - float const min_scaling_factor, int32_t const hidden_size, + float const* scale_ub, float const var_epsilon, int32_t const hidden_size, scalar_t* __restrict__ residual = nullptr) { float rms = 0.0f; float token_scale = 0.0f; @@ -27,8 +26,8 @@ __device__ void rms_norm_dynamic_per_token_quant_vec( // Compute scale 
vllm::vectorized::compute_dynamic_per_token_scales( - &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor, - hidden_size, residual); + &token_scale, scales, input, weight, rms, scale_ub, hidden_size, + residual); // RMS Norm + Quant if constexpr (std::is_same_v) { @@ -50,8 +49,7 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel( float* __restrict__ scales, // [num_tokens] scalar_t const* __restrict__ input, // [..., hidden_size] scalar_t const* __restrict__ weight, // [hidden_size] - float const* scale_ub, float const var_epsilon, - float const min_scaling_factor, int32_t const hidden_size, + float const* scale_ub, float const var_epsilon, int32_t const hidden_size, scalar_t* __restrict__ residual = nullptr) { // For vectorization, token_input and token_output pointers need to be // aligned at 8-byte and 4-byte addresses respectively. @@ -60,8 +58,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel( if (can_vectorize) { return rms_norm_dynamic_per_token_quant_vec( - out, scales, input, weight, scale_ub, var_epsilon, min_scaling_factor, - hidden_size, residual); + out, scales, input, weight, scale_ub, var_epsilon, hidden_size, + residual); } float rms = 0.0f; @@ -72,8 +70,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel( var_epsilon, residual); // Compute Scale vllm::compute_dynamic_per_token_scales( - &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor, - hidden_size, residual); + &token_scale, scales, input, weight, rms, scale_ub, hidden_size, + residual); // RMS Norm + Quant if constexpr (std::is_same_v) { @@ -105,11 +103,6 @@ void rms_norm_dynamic_per_token_quant_dispatch( const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - const float min_scaling_factor = - out.dtype() == torch::kInt8 - ? 
std::numeric_limits::epsilon() - : 1.0f / (std::numeric_limits::max() * 512.f); - if (residual.has_value()) { VLLM_DISPATCH_QUANT_TYPES( out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] { @@ -119,8 +112,7 @@ void rms_norm_dynamic_per_token_quant_dispatch( out.data_ptr(), scales.data_ptr(), input.data_ptr(), weight.data_ptr(), scale_ub.has_value() ? scale_ub->data_ptr() : nullptr, - var_epsilon, min_scaling_factor, hidden_size, - residual->data_ptr()); + var_epsilon, hidden_size, residual->data_ptr()); }); } else { @@ -132,7 +124,7 @@ void rms_norm_dynamic_per_token_quant_dispatch( out.data_ptr(), scales.data_ptr(), input.data_ptr(), weight.data_ptr(), scale_ub.has_value() ? scale_ub->data_ptr() : nullptr, - var_epsilon, min_scaling_factor, hidden_size, nullptr); + var_epsilon, hidden_size, nullptr); }); } } diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh index b5cea98f7706e..e6d23cd24e178 100644 --- a/csrc/quantization/fused_kernels/layernorm_utils.cuh +++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh @@ -5,6 +5,7 @@ */ #include "quantization/vectorization.cuh" +#include "quantization/utils.cuh" #include "quant_conversions.cuh" #ifndef USE_ROCM @@ -51,11 +52,11 @@ __device__ void compute_dynamic_per_token_scales( float* __restrict__ token_scale, float* __restrict__ all_token_scales, scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, float const rms, float const* __restrict__ scale_ub, - float const min_scaling_factor, int32_t const hidden_size, + int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr) { int64_t const token_offset = blockIdx.x * static_cast(hidden_size); ; - constexpr scalar_out_t qmax{std::numeric_limits::max()}; + constexpr scalar_out_t qmax{quant_type_max_v}; float block_absmax_val_maybe = 0.0f; for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) { @@ -83,7 +84,7 @@ __device__ void 
compute_dynamic_per_token_scales( scale = block_absmax_val_maybe; } // token scale computation - scale = max(scale / qmax, min_scaling_factor); + scale = max(scale / qmax, min_scaling_factor::val()); s_token_scale = scale; // Shared memory store all_token_scales[blockIdx.x] = scale; // Global output store } @@ -184,7 +185,7 @@ __device__ void compute_dynamic_per_token_scales( float* __restrict__ token_scale, float* __restrict__ all_token_scales, scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, float const rms, float const* __restrict__ scale_ub, - float const min_scaling_factor, int32_t const hidden_size, + int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr) { int64_t const token_offset = blockIdx.x * static_cast(hidden_size); ; @@ -200,7 +201,7 @@ __device__ void compute_dynamic_per_token_scales( reinterpret_cast const*>(&residual[token_offset]); } - constexpr scalar_out_t qmax{std::numeric_limits::max()}; + constexpr scalar_out_t qmax{quant_type_max_v}; int32_t const num_vec_elems = hidden_size >> 2; float block_absmax_val_maybe = 0.0f; @@ -248,7 +249,7 @@ __device__ void compute_dynamic_per_token_scales( scale = block_absmax_val_maybe; } // token scale computation - scale = max(scale / qmax, min_scaling_factor); + scale = max(scale / qmax, min_scaling_factor::val()); s_token_scale = scale; // shared memory store all_token_scales[blockIdx.x] = scale; // global output store } diff --git a/csrc/quantization/fused_kernels/quant_conversions.cuh b/csrc/quantization/fused_kernels/quant_conversions.cuh index 9ac7b188f5181..7c10aaa81cf7b 100644 --- a/csrc/quantization/fused_kernels/quant_conversions.cuh +++ b/csrc/quantization/fused_kernels/quant_conversions.cuh @@ -33,8 +33,8 @@ static __device__ __forceinline__ int8_t float_to_int8_rn(float const x) { template static __device__ __forceinline__ fp8_type float_to_fp8(float const x) { - float const r = fmax(-fp8_e4m3_adjusted_max_v, - fmin(x, fp8_e4m3_adjusted_max_v)); + 
float const r = + fmax(-quant_type_max_v, fmin(x, quant_type_max_v)); return static_cast(r); } diff --git a/csrc/quantization/gguf/dequantize.cuh b/csrc/quantization/gguf/dequantize.cuh index 41fc032ff1a56..9d355003ef91d 100644 --- a/csrc/quantization/gguf/dequantize.cuh +++ b/csrc/quantization/gguf/dequantize.cuh @@ -94,8 +94,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __ dfloat2 v; dequantize_kernel(vx, ib, iqs, v); - y[iybs + iqs + 0] = v.x; - y[iybs + iqs + y_offset] = v.y; + y[iybs + iqs + 0] = convert_from_half(v.x); + y[iybs + iqs + y_offset] = convert_from_half(v.y); } template @@ -114,10 +114,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t half dall = __low2half(x[i].dm); half dmin = __high2half(x[i].dm); - y[l+ 0] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4))); - y[l+32] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4))); - y[l+64] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4))); - y[l+96] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4))); + y[l+ 0] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4)))); + y[l+32] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4)))); + y[l+64] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4)))); + y[l+96] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), 
__hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4)))); } template @@ -148,7 +148,9 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t const uint8_t * q = x[i].qs + 32*n; const uint8_t * hm = x[i].hmask; - for (int l = l0; l < l0+4; ++l) y[l] = __hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4))); + for (int l = l0; l < l0+4; ++l) { + y[l] = convert_from_half(__hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)))); + } } static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { @@ -188,8 +190,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m)); for (int l = 0; l < n; ++l) { - y[l + 0] = __hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1); - y[l +32] = __hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2); + y[l + 0] = convert_from_half(__hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1)); + y[l +32] = convert_from_half(__hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2)); } } @@ -220,11 +222,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m)); uint8_t hm = 1 << (2*il); - y[ 0] = __hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1); - y[ 1] = __hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1); + y[ 0] = convert_from_half(__hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1)); + y[ 1] = convert_from_half(__hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1)); hm <<= 1; - y[32] = __hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2); - y[33] = __hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 
16 : 0))), m2); + y[32] = convert_from_half(__hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2)); + y[33] = convert_from_half(__hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 16 : 0))), m2)); } template @@ -247,10 +249,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t const uint8_t qh = x[i].qh[32*ip + il]; const int8_t * sc = x[i].scales + is; - y[ 0] = __hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32))); - y[32] = __hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32))); - y[64] = __hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32))); - y[96] = __hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32))); + y[ 0] = convert_from_half(__hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)))); + y[32] = convert_from_half(__hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)))); + y[64] = convert_from_half(__hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)))); + y[96] = convert_from_half(__hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)))); } template @@ -269,7 +271,7 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds const uint32_t aux32 = q2[2] | (q2[3] << 16); const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f; const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; - for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f)); + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? 
-1.f : 1.f); } template @@ -286,7 +288,7 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; - for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f)); + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); } @@ -303,7 +305,7 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; - for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f)); + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); } template @@ -324,8 +326,8 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f; const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; for (int j = 0; j < 4; ++j) { - y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f)); - y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f)); + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } } @@ -345,8 +347,8 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_ const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f; const uint8_t signs = x[i].signs[4*ib + il]; for (int j = 0; j < 4; ++j) { - y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? 
-1.f : 1.f)); - y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f)); + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } } @@ -367,7 +369,7 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; grid32[0] &= 0x0f0f0f0f; for (int j = 0; j < 8; ++j) { - y[j] = __float2half(d * (q[j] + delta)); + y[j] = d * (q[j] + delta); } } @@ -392,7 +394,7 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; grid32[0] &= 0x0f0f0f0f; for (int j = 0; j < 8; ++j) { - y[j] = __float2half(d * (q[j] + delta)); + y[j] = d * (q[j] + delta); } } @@ -409,8 +411,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst const uint8_t * q4 = x[ib].qs + 4*il; const float d = __half2float(x[ib].d); for (int j = 0; j < 4; ++j) { - y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]); - y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]); + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; } } @@ -427,8 +429,8 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst const uint8_t * q4 = x[i].qs + 16*ib + 4*il; const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); for (int j = 0; j < 4; ++j) { - y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]); - y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]); + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; } } @@ -522,7 +524,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, dequantize_block_iq4_xs<<>>(vx, y); } -static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) { +template +static to_cuda_ggml_t ggml_get_to_cuda(int64_t type) { switch (type) { case 2: 
return dequantize_block_cuda; diff --git a/csrc/quantization/gguf/ggml-common.h b/csrc/quantization/gguf/ggml-common.h index d42205a6571db..6bef5db3ccf15 100644 --- a/csrc/quantization/gguf/ggml-common.h +++ b/csrc/quantization/gguf/ggml-common.h @@ -1063,7 +1063,8 @@ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, - typedef half dfloat; // dequantize float typedef half2 dfloat2; typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); -typedef void (*to_fp16_cuda_t)(const void * __restrict__ x, dfloat * __restrict__ y, int k, cudaStream_t stream); +template +using to_cuda_ggml_t = void (*)(const void * __restrict__ x, dst_t * __restrict__ y, int k, cudaStream_t stream); typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); typedef void (*load_tiles_cuda_t)( @@ -1075,6 +1076,25 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)( // Utility function +template +static __device__ __forceinline__ dst_t convert_from_half(half val) { + return val; +} + +template<> +__device__ __forceinline__ c10::BFloat16 convert_from_half(half val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __float2bfloat16(__half2float(val)); +#else + return __half2float(val); +#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +} + +template<> +__device__ __forceinline__ float convert_from_half(half val) { + return __half2float(val); +} + #if defined(USE_ROCM) #ifndef __has_builtin diff --git a/csrc/quantization/gguf/gguf_kernel.cu b/csrc/quantization/gguf/gguf_kernel.cu index dbbb97e6fb3a9..56b78f1834d15 100644 --- a/csrc/quantization/gguf/gguf_kernel.cu +++ b/csrc/quantization/gguf/gguf_kernel.cu @@ -71,14 +71,19 @@ static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx, } torch::Tensor ggml_dequantize(torch::Tensor W, // quant 
weight - int64_t type, int64_t m, int64_t n) { + int64_t type, int64_t m, int64_t n, + std::optional const& dtype) { const at::cuda::OptionalCUDAGuard device_guard(device_of(W)); - auto options = - torch::TensorOptions().dtype(torch::kFloat16).device(W.device()); + auto dtype_ = dtype.value_or(torch::kFloat16); + auto options = torch::TensorOptions().dtype(dtype_).device(W.device()); at::Tensor DW = torch::empty({m, n}, options); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type); - to_fp16_cuda((void*)W.data_ptr(), (half*)DW.data_ptr(), m * n, stream); + + VLLM_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] { + auto to_cuda = ggml_get_to_cuda(type); + to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream); + }); + return DW; } diff --git a/csrc/quantization/gguf/moe.cuh b/csrc/quantization/gguf/moe.cuh index c10c59d7a38a7..df9b84abcc134 100644 --- a/csrc/quantization/gguf/moe.cuh +++ b/csrc/quantization/gguf/moe.cuh @@ -129,7 +129,7 @@ static __device__ __forceinline__ void moe_q( } #if defined(USE_ROCM) - #define MOE_X_Q4_0 64 + #define MOE_X_Q4_0 8 #define MOE_Y_Q4_0 128 #define NWARPS_Q4_0 8 #else @@ -190,7 +190,7 @@ static void ggml_moe_q4_0_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q4_1 64 + #define MOE_X_Q4_1 8 #define MOE_Y_Q4_1 128 #define NWARPS_Q4_1 8 #else @@ -251,7 +251,7 @@ static void ggml_moe_q4_1_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q5_0 64 + #define MOE_X_Q5_0 8 #define MOE_Y_Q5_0 128 #define NWARPS_Q5_0 8 #else @@ -312,7 +312,7 @@ static void ggml_moe_q5_0_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q5_1 64 + #define MOE_X_Q5_1 8 #define MOE_Y_Q5_1 128 #define NWARPS_Q5_1 8 #else @@ -373,7 +373,7 @@ static void ggml_moe_q5_1_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q8_0 64 + #define MOE_X_Q8_0 8 #define MOE_Y_Q8_0 128 #define NWARPS_Q8_0 8 #else @@ -434,7 +434,7 @@ static void 
ggml_moe_q8_0_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q2_K 64 + #define MOE_X_Q2_K 8 #define MOE_Y_Q2_K 128 #define NWARPS_Q2_K 8 #else @@ -495,7 +495,7 @@ static void ggml_moe_q2_K_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q3_K 64 + #define MOE_X_Q3_K 8 #define MOE_Y_Q3_K 128 #define NWARPS_Q3_K 8 #else @@ -556,7 +556,7 @@ static void ggml_moe_q3_K_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q4_K 64 + #define MOE_X_Q4_K 8 #define MOE_Y_Q4_K 128 #define NWARPS_Q4_K 8 #else @@ -617,7 +617,7 @@ static void ggml_moe_q4_K_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q5_K 64 + #define MOE_X_Q5_K 8 #define MOE_Y_Q5_K 128 #define NWARPS_Q5_K 8 #else @@ -678,7 +678,7 @@ static void ggml_moe_q5_K_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q6_K 64 + #define MOE_X_Q6_K 8 #define MOE_Y_Q6_K 128 #define NWARPS_Q6_K 8 #else diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 14d397d03e135..83bbd1e6816a8 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -1785,7 +1785,7 @@ __global__ void Marlin( <<>>( \ A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ num_groups, prob_m, prob_n, prob_k, lda, locks, \ - use_atomic_add, use_fp32_reduce); \ + part_use_atomic_add, use_fp32_reduce); \ } \ } @@ -2215,6 +2215,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, thread_m_blocks = exec_cfg.max_m_blocks; } + // atomic add reduce have better performance only when m * n is small + bool part_use_atomic_add = + use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048; + if (false) { } GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256) diff --git a/csrc/quantization/gptq_marlin/marlin.cuh b/csrc/quantization/gptq_marlin/marlin.cuh index 74ccbac57bd3c..f3b44641e77ee 100644 --- a/csrc/quantization/gptq_marlin/marlin.cuh +++ b/csrc/quantization/gptq_marlin/marlin.cuh @@ -9,7 +9,11 @@ #include #include -namespace 
marlin { +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin +#endif + +namespace MARLIN_NAMESPACE_NAME { // Marlin params @@ -23,6 +27,7 @@ static constexpr int pipe_stages = static constexpr int min_thread_n = 64; static constexpr int min_thread_k = 64; +static constexpr int max_thread_n = 256; static constexpr int tile_size = 16; static constexpr int max_par = 16; @@ -84,4 +89,4 @@ __device__ inline void cp_async_wait() { #endif -} // namespace marlin +} // namespace MARLIN_NAMESPACE_NAME diff --git a/csrc/quantization/gptq_marlin/marlin_dtypes.cuh b/csrc/quantization/gptq_marlin/marlin_dtypes.cuh index be06c09bee331..cc16054814342 100644 --- a/csrc/quantization/gptq_marlin/marlin_dtypes.cuh +++ b/csrc/quantization/gptq_marlin/marlin_dtypes.cuh @@ -5,7 +5,11 @@ #include #include -namespace marlin { +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin +#endif + +namespace MARLIN_NAMESPACE_NAME { template class ScalarType {}; @@ -54,7 +58,7 @@ class ScalarType { using FragS = Vec; using FragZP = Vec; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 static __device__ float inline num2float(const nv_bfloat16 x) { return __bfloat162float(x); } @@ -74,6 +78,6 @@ class ScalarType { #endif }; -} // namespace marlin +} // namespace MARLIN_NAMESPACE_NAME #endif diff --git a/csrc/quantization/utils.cuh b/csrc/quantization/utils.cuh new file mode 100644 index 0000000000000..73055a1528744 --- /dev/null +++ b/csrc/quantization/utils.cuh @@ -0,0 +1,59 @@ +#pragma once + +/** + * Quantization utilities including: + * Adjusted maximum values for qtypes. + * Minimum scaling factors for qtypes. 
+ */ + +#include +#include + +#ifndef USE_ROCM + #include + #define MAYBE_HOST_DEVICE C10_HOST_DEVICE +#else + #include + #include + #include + // ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr + #define MAYBE_HOST_DEVICE +#endif + +template || + std::is_same_v || + std::is_same_v>> +struct quant_type_max { + static constexpr T val() { return std::numeric_limits::max(); } +}; + +// Using the default max value from pytorch (240.0 0x7F) will cause accuracy +// issues when running dynamic quantization. Here use 224.0 0x7E for rocm. +template <> +struct quant_type_max { + static constexpr c10::Float8_e4m3fnuz val() { + return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits()); + } +}; + +template +MAYBE_HOST_DEVICE static constexpr T quant_type_max_v = + quant_type_max::val(); + +template || + std::is_same_v || + std::is_same_v>> +struct min_scaling_factor { + C10_DEVICE C10_ALWAYS_INLINE static float val() { + return 1.0f / (quant_type_max_v * 512.0f); + } +}; + +template <> +struct min_scaling_factor { + C10_DEVICE C10_ALWAYS_INLINE static float val() { + return std::numeric_limits::epsilon(); + } +}; \ No newline at end of file diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 8ab2af22f4d0c..2c3cae95e7f55 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -272,6 +272,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, @@ -291,6 +292,13 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const int rowid = laneid / 16; const auto seq_idx = blockIdx.x; + // NOTE queries with sequence len > 1 are prefills and taken care by another + // 
kernel. + if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]) != 1) { + return; + } + const auto partition_idx = blockIdx.y; constexpr int T_PAR_SIZE = 256; // token partition size set to 256 @@ -377,9 +385,10 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( // fetch Q in shared across warps and then write to registers const int local_qhead_idx = 4 * warpid + rowid; const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; - const int64_t seq_idx64 = static_cast(seq_idx); + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); const scalar_t* q_ptr = - q + seq_idx64 * q_stride + global_qhead_idx * HEAD_SIZE; + q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE; const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B; if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) { @@ -777,6 +786,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, @@ -794,6 +804,12 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int lane4id = laneid % 4; const auto seq_idx = blockIdx.x; + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. 
+ if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) { + return; + } const auto partition_idx = blockIdx.y; const auto partition_size = blockDim.x; const auto max_num_partitions = gridDim.y; @@ -882,9 +898,11 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( } // fetch q elements - // every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elems + // every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elemsc + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); const scalar_t* q_ptr = - q + seq_idx * q_stride + wg_start_head_idx * HEAD_SIZE; + q + query_start_off * q_stride + wg_start_head_idx * HEAD_SIZE; const _B16x8* q_ptrh8 = reinterpret_cast(q_ptr); const int qhead_elemh8 = laneid / 4; @@ -1267,10 +1285,19 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions) { const auto num_heads = gridDim.x; const auto head_idx = blockIdx.x; const auto seq_idx = blockIdx.y; + + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. 
+ if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) { + return; + } + const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; @@ -1439,7 +1466,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( __fdividef(1.0f, shared_global_exp_sum + 1e-6f); acc *= inv_global_exp_sum; - OUTT* out_ptr = out + static_cast(seq_idx) * num_heads * HEAD_SIZE + + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); + OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE + static_cast(head_idx) * HEAD_SIZE; if constexpr (std::is_same::value) { out_ptr[threadIdx.x] = @@ -1466,6 +1495,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel( const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, @@ -1492,6 +1522,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, @@ -1515,6 +1546,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] const int* __restrict__ context_lens, // [num_seqs] + const 
int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions) { UNREACHABLE_CODE } @@ -1522,34 +1554,34 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support -#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \ - paged_attention_ll4mi_QKV_mfma16_kernel \ - <<>>( \ - query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \ - k_scale_ptr, v_scale_ptr); +#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \ + paged_attention_ll4mi_QKV_mfma16_kernel \ + <<>>( \ + query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ + block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ + max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ + kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ + max_ctx_blocks, k_scale_ptr, v_scale_ptr); -#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO) \ - paged_attention_ll4mi_QKV_mfma4_kernel \ - <<>>( \ - query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \ - k_scale_ptr, v_scale_ptr); +#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO) \ + paged_attention_ll4mi_QKV_mfma4_kernel \ + <<>>( \ + query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ + block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ + max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ + kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ + max_ctx_blocks, k_scale_ptr, v_scale_ptr); #define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \ 
paged_attention_ll4mi_reduce_kernel \ <<>>( \ out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \ - context_lens_ptr, max_num_partitions); + context_lens_ptr, query_start_loc_ptr, max_num_partitions); template & alibi_slopes, - torch::Tensor& k_scale, torch::Tensor& v_scale) { - int num_seqs = query.size(0); + const std::optional& query_start_loc, int max_context_len, + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale) { + int num_seqs = block_tables.size(0); int num_heads = query.size(1); int head_size = query.size(2); int max_num_blocks_per_seq = block_tables.size(1); @@ -1569,6 +1602,13 @@ void paged_attention_custom_launcher( int kv_block_stride = key_cache.stride(0); int kv_head_stride = key_cache.stride(1); + // NOTE: query start location is optional for V0 decode should not be used. + // If batch contains mix of prefills and decode, prefills should be skipped. + const int* query_start_loc_ptr = + query_start_loc + ? reinterpret_cast(query_start_loc.value().data_ptr()) + : nullptr; + // NOTE: alibi_slopes is optional. 
const float* alibi_slopes_ptr = alibi_slopes @@ -1700,8 +1740,8 @@ void paged_attention_custom_launcher( paged_attention_custom_launcher( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, context_lens, max_context_len, \ - alibi_slopes, k_scale, v_scale); + num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ + max_context_len, alibi_slopes, k_scale, v_scale); #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ PSIZE) \ @@ -1750,6 +1790,7 @@ void paged_attention( double scale, torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& context_lens, // [num_seqs] + const std::optional& query_start_loc, // [num_seqs] int64_t block_size, int64_t max_context_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index ba161951772ad..b90cfdc617afd 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -2,13 +2,23 @@ #include +torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, + const int64_t rows_per_block); + +torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, + const int64_t CuCount); + +void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, + at::Tensor& scale_a, at::Tensor& scale_b, const int64_t CuCount); + void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, - torch::Tensor& context_lens, int64_t block_size, - int64_t max_context_len, + torch::Tensor& context_lens, + const std::optional& query_start_loc, + int64_t block_size, int64_t max_context_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale); diff --git a/csrc/rocm/skinny_gemms.cu 
b/csrc/rocm/skinny_gemms.cu new file mode 100644 index 0000000000000..29dbbe8e35e8f --- /dev/null +++ b/csrc/rocm/skinny_gemms.cu @@ -0,0 +1,1600 @@ +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "cuda_compat.h" +#include "dispatch_utils.h" +#include "quantization/fp8/common.cuh" + +#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__)) + #define __HIP__MI300_MI250__ +#endif + +#if defined(__HIPCC__) && defined(__gfx942__) + #define __HIP__MI300__ +#endif + +#if defined(NDEBUG) + #undef NDEBUG + #include + #define UNREACHABLE_CODE assert(false); + #define NDEBUG +#else + #define UNREACHABLE_CODE assert(false); +#endif + +template +struct scalar {}; + +template +struct scalar2 {}; + +template +__device__ __forceinline__ float2 __s22float2(T v); + +template +__device__ __forceinline__ T __float2s(float v); + +template +__device__ __forceinline__ T __float22s2_rn(float2 v); + +// Definitions and cvt functions for fp16 +template <> +struct scalar { + using type = half; +}; + +template <> +struct scalar2 { + using type = __half2; +}; + +template <> +__device__ __forceinline__ half __float2s(float v) { + return __float2half(v); +} + +template <> +__device__ __forceinline__ float2 __s22float2(__half2 v) { + return __half22float2(v); +} + +template <> +__device__ __forceinline__ __half2 __float22s2_rn(float2 v) { + return __float22half2_rn(v); +} + +// Definitions and cvt functions for bf16 +template <> +struct scalar { + using type = __hip_bfloat16; +}; + +template <> +struct scalar2 { + using type = __hip_bfloat162; +}; + +template <> +__device__ __forceinline__ __hip_bfloat16 __float2s(float v) { + return __float2bfloat16(v); +} + +template <> +__device__ __forceinline__ float2 __s22float2(__hip_bfloat162 v) { + return __bfloat1622float2(v); +} + +template <> +__device__ __forceinline__ __hip_bfloat162 __float22s2_rn(float2 v) { + return __float22bfloat162_rn(v); +} + +template +__device__ __forceinline__ 
T loadnt(T* addr) { + return __builtin_nontemporal_load(addr); +} + +__device__ __forceinline__ float4 load_ntmprl(const float4* addr) { + auto addr_alias = reinterpret_cast(addr); + auto dat0 = loadnt(addr_alias); + auto dat1 = loadnt(addr_alias + 1); + auto dat2 = loadnt(addr_alias + 2); + auto dat3 = loadnt(addr_alias + 3); + return make_float4(dat0, dat1, dat2, dat3); +} + +// TBlock fetches entire rows of A, and entire col of B (K dimension); assume +// N=1 for time being grid is M/A_NUM_ROWS blocks +template +__global__ void LLGemm1_kernel(const scalar_t* in_a, const scalar_t* in_b, + scalar_t* out_c, const int K) { + using scalar2_t = typename scalar2::type; + auto af4 = reinterpret_cast(in_a); + auto bf4 = reinterpret_cast(in_b); + auto c = reinterpret_cast(out_c); + __shared__ float red_smem[NUM_A_ROWS_PER_BLOCK][WARP_SIZE]; + const int row_addr = blockIdx.x * NUM_A_ROWS_PER_BLOCK * K / 8; + const int threadid = threadIdx.x; + const int warp = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x % WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + const int qwarpid = threadid / num_warps; + const int qthreadid = threadid % num_warps; + float4 rowA_elem4[NUM_A_ROWS_PER_BLOCK]; + scalar2_t colB_elem4x, colB_elem4y, colB_elem4z, colB_elem4w; + float acc[NUM_A_ROWS_PER_BLOCK]; + scalar2_t acch2; + scalar2_t oval; + + // As we later use warp shuffle operations, we may have more threads in the + // block than the actual available data, hence the if guard here. + if (threadid * 8 < K) { +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + // rowA_elem4[i] holds 8 * half numbers seen as a single float4. 
+ rowA_elem4[i] = load_ntmprl(&af4[row_addr + threadid + K / 8 * i]); + } + } + + colB_elem4x = bf4[threadid * 4 + 0]; + colB_elem4y = bf4[threadid * 4 + 1]; + colB_elem4z = bf4[threadid * 4 + 2]; + colB_elem4w = bf4[threadid * 4 + 3]; + + scalar2_t Af2; + scalar2_t Bf2; + float2 S; + + auto Ah2ptr = reinterpret_cast(&rowA_elem4); + scalar2_t* ah2lptr; + +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + // Multiply-add on 8 scalar_t. + ah2lptr = Ah2ptr + i * 4; + Af2 = *(ah2lptr); + acch2 = __hmul2(Af2, colB_elem4x); + Af2 = *(ah2lptr + 1); + acch2 = __hfma2(Af2, colB_elem4y, acch2); + Af2 = *(ah2lptr + 2); + acch2 = __hfma2(Af2, colB_elem4z, acch2); + Af2 = *(ah2lptr + 3); + acch2 = __hfma2(Af2, colB_elem4w, acch2); + S = __s22float2(acch2); + + // See comment above concerning the if guard. + acc[i] = (threadid * 8 < K ? S.x + S.y : 0.f); + } + +// all reduce across warp. +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + acc[i] += __shfl_xor(acc[i], mask); + } + } + + // Warp leaders store the data to shared memory. + if (lane < NUM_A_ROWS_PER_BLOCK) { + red_smem[lane][warp] = acc[lane]; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + if (qwarpid < NUM_A_ROWS_PER_BLOCK) { + acc[qwarpid] = qthreadid < num_warps ? 
red_smem[qwarpid][qthreadid] : 0.f; + for (int mask = num_warps / 2; mask >= 1; mask /= 2) { + acc[qwarpid] += __shfl_xor(acc[qwarpid], mask); + } + float oval2 = __shfl_xor(acc[qwarpid], num_warps); + + if (lane % (num_warps * 2) == 0) { + oval = __float22s2_rn(make_float2(acc[qwarpid], oval2)); + c[blockIdx.x * NUM_A_ROWS_PER_BLOCK / 2 + qwarpid / 2] = oval; + } + } +} + +torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, + const int64_t rows_per_block) { + auto M = in_a.size(0); + auto K = in_a.size(1); + auto N = in_b.size(0); + + TORCH_CHECK(N == 1, "Row number of activation tensor must be 1."); + TORCH_CHECK(in_a.dtype() == in_b.dtype()); + TORCH_CHECK(in_b.dtype() == torch::kFloat16 || + in_b.dtype() == torch::kBFloat16); + + auto out_c = torch::empty( + {N, M}, torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device())); + + // NUM_TREADS need to be a multiple of WARP_SIZE, as we are using warp shuffle + // operations. + const int NUM_THREADS = + K * 2 / 16 % WARP_SIZE == 0 + ? K * 2 / 16 + : K * 2 / 16 + (WARP_SIZE - K * 2 / 16 % WARP_SIZE); + + int NUM_BLOCKS = M / rows_per_block; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(in_b)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // call the kernel function... 
+ AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "LLGemm1", [&] { + auto a_ptr = in_a.data_ptr(); + auto b_ptr = in_b.data_ptr(); + auto c_ptr = out_c.data_ptr(); + if (rows_per_block == 2) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else if (rows_per_block == 4) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else if (rows_per_block == 8) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else if (rows_per_block == 16) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else { + NUM_BLOCKS = M / 4; + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } + }); + + return out_c; +} + +#define DOT2C(V0, V2, V3) \ + if constexpr (std::is_same_v) { \ + asm("v_dot2c_f32_f16 %0, %2, %3" : "=v"(V0) : "0"(V0), "v"(V2), "v"(V3)); \ + } else if constexpr (std::is_same_v) { \ + float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \ + __bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \ + V0 += (s.x + s.y); \ + } + +#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +// This version targets cases where A[] fits LDS capacity +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float; + union bigType { + scalar_t h[A_CHUNK]; + float f[A_CHUNK / 2]; + float2 f2[A_CHUNK / 4]; + double d[A_CHUNK / 4]; + scalar8 h8; + }; + + //---------------------------------------------------- + // Reserving 64 KB of LDS to have 1 WG / CU + // Goal is to bring the activation matrix A to the LDS + // and use it across the lifetime of the work group + // TODO: When activation matrix is larger than 64 KB + // then this is not goint to work! 
+ //---------------------------------------------------- + __shared__ scalar_t s[1024 * 32]; + + //---------------------------------------------------- + // Fetch the activation matrix to LDS + // Loop iteration: + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements + // - Each WG will fetch 512 * 16 => 8K elements + // - Then the WG will move to another 8 K elements + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k = 0; k < min(K * N, 32 * 1024); + k += THRDS * WvPrGrp * A_CHUNK) { + uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + + if (k_in >= min(K * N, 32 * 1024)) break; + + *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); + } + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; + + float sum[N][YTILE]; + + //---------------------------------------------------- + // Each wave works on a single column of weight matrix. + // There are 16 waves per WG, and hence, each WG is + // working on 16 columns of weight matrix. Moreover, + // we tile in column direction by YTILE, so when YTILE=1 + // the above math is right, however, when YTILE=2 then + // each wave will be working on 2 columns and WG will + // be working on 32 columns. + // + // Top level loop that makes WGs persistent! + // - WGs iterates across columns of weight matrix + // - Each wave within WG works on a given column(s) + // - After completing first set of columns, WGs start + // working on the next set of available columns + //---------------------------------------------------- + while (m < M) { + //---------------------------------------------------- + // 'sum' accumulates the matrix A x B computation + // split across 64 lanes. + // + // YTILE represents how many column of weight matrix + // are being worked on by each wave. 
+ //---------------------------------------------------- + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = 0; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + //---------------------------------------------------- + // Fetch weight matrix B in interleaved K-split! + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements (1024B) + // - YTILE represents the number of column being serviced + // by wave + // - Loop for fetching weight matrix (B) are unrolled + // + // Fetch activation matrix A from LDS + // - Loop for fetching activation matrix (A) are unrolled + // + // Finally, do the matrix multiplication in an unrolled + // fashion. This provides lot of food for compiler + // scheduling. + // + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + // for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + // Fetch the weight matrix from memory! 
+ #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const scalar_t* B_ = &B[(m + 0) * K + k_]; + bigB[0][k2].h8 = (loadnt((scalar8*)(&B_[0 * K]))); + //---------------------------------------------------- + // The following code with YTILE > 1 has to be deleted + //---------------------------------------------------- + if constexpr (YTILE >= 2) + bigB[1][k2].h8 = (loadnt((scalar8*)(&B_[1 * K]))); + if constexpr (YTILE >= 3) + bigB[2][k2].h8 = (loadnt((scalar8*)(&B_[2 * K]))); + if constexpr (YTILE >= 4) + bigB[3][k2].h8 = (loadnt((scalar8*)(&B_[3 * K]))); + if constexpr (YTILE >= 5) + bigB[4][k2].h8 = (loadnt((scalar8*)(&B_[4 * K]))); + if constexpr (YTILE >= 6) + bigB[5][k2].h8 = (loadnt((scalar8*)(&B_[5 * K]))); + if constexpr (YTILE >= 7) + bigB[6][k2].h8 = (loadnt((scalar8*)(&B_[6 * K]))); + if constexpr (YTILE >= 8) + bigB[7][k2].h8 = (loadnt((scalar8*)(&B_[7 * K]))); + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + // Fetch A activation matrix in interleaved fashion from LDS or memory + + for (int n = 0; n < N; n++) { + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + // Do the matrix multiplication of activation and weight matrix + // - Remember the accumulation is happening for K-split of 64! 
+ #pragma unroll + for (uint32_t n = 0; n < N; n++) { + #pragma unroll + for (uint32_t b = 0; b < A_CHUNK / 2; b++) { + DOT2C(sum[n][0], bigA[n][k2].f[b], bigB[0][k2].f[b]) + //---------------------------------------------------- + // The following code with YTILE > 1 + //---------------------------------------------------- + if constexpr (YTILE >= 2) { + DOT2C(sum[n][1], bigA[n][k2].f[b], bigB[1][k2].f[b]); + } + if constexpr (YTILE >= 3) { + DOT2C(sum[n][2], bigA[n][k2].f[b], bigB[2][k2].f[b]); + } + if constexpr (YTILE >= 4) { + DOT2C(sum[n][3], bigA[n][k2].f[b], bigB[3][k2].f[b]); + } + if constexpr (YTILE >= 5) { + DOT2C(sum[n][4], bigA[n][k2].f[b], bigB[4][k2].f[b]); + } + if constexpr (YTILE >= 6) { + DOT2C(sum[n][5], bigA[n][k2].f[b], bigB[5][k2].f[b]); + } + if constexpr (YTILE >= 7) { + DOT2C(sum[n][6], bigA[n][k2].f[b], bigB[6][k2].f[b]); + } + if constexpr (YTILE >= 8) { + DOT2C(sum[n][7], bigA[n][k2].f[b], bigB[7][k2].f[b]); + } + } + } + } + } + + //---------------------------------------------------- + // Final reduction step using shuffle + //---------------------------------------------------- + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), 
"v"(sum[n][y]), "v"(sum[n][y])); + } + } + if (threadIdx.x == 63) { + for (int n = 0; n < N; n++) { + for (int i = 0; i < YTILE; i++) { + // if (commitColumn[i]) C[m + i + n * M] = __float2half(sum[n][i]); + C[m + i + n * M] = __float2s(sum[n][i]); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + } +} +#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +template +__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support + +#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +// This version targets cases where A[] marginally exceeds LDS capacity +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitK_hf_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float; + union bigType { + scalar_t h[A_CHUNK]; + float f[A_CHUNK / 2]; + float2 f2[A_CHUNK / 4]; + double d[A_CHUNK / 4]; + scalar8 h8; + }; + + //---------------------------------------------------- + // Reserving 64 KB of LDS to have 1 WG / CU + // Goal is to bring the activation matrix A to the LDS + // and use it across the lifetime of the work group + // TODO: When activation matrix is larger than 64 KB + // then this is not goint to work! + //---------------------------------------------------- + __shared__ scalar_t s[1024 * 32]; + + //---------------------------------------------------- + // Computation of columns that need to be committed to memory! 
+ //---------------------------------------------------- + uint32_t commitColumn[YTILE]; + for (uint32_t i = 0; i < YTILE; i++) { + commitColumn[i] = 1; + } + + //---------------------------------------------------- + // Indexing function into the column of weight matrix B + // Algorithm does 64 lane k-splitting / wave and uses + // WG ID and Thread ID to find the index. + //---------------------------------------------------- + // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp); + uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! + if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + + //---------------------------------------------------- + // Fetch the activation matrix to LDS + // Loop iteration: + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements + // - Each WG will fetch 512 * 16 => 8K elements + // - Then the WG will move to another 8 K elements + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k = 0; k < min(K * N, 32 * 1024); + k += THRDS * WvPrGrp * A_CHUNK) { + uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + + if (k_in >= min(K * N, 32 * 1024)) break; + + *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); + } + + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + float sum[N][YTILE]; + + //---------------------------------------------------- + // Each wave works on a single column of weight matrix. + // There are 16 waves per WG, and hence, each WG is + // working on 16 columns of weight matrix. 
Moreover, + // we tile in column direction by YTILE, so when YTILE=1 + // the above math is right, however, when YTILE=2 then + // each wave will be working on 2 columns and WG will + // be working on 32 columns. + // + // Top level loop that makes WGs persistent! + // - WGs iterates across columns of weight matrix + // - Each wave within WG works on a given column(s) + // - After completing first set of columns, WGs start + // working on the next set of available columns + //---------------------------------------------------- + while (m < M) { + //---------------------------------------------------- + // 'sum' accumulates the matrix A x B computation + // split across 64 lanes. + // + // YTILE represents how many column of weight matrix + // are being worked on by each wave. + //---------------------------------------------------- + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = 0; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + //---------------------------------------------------- + // Fetch weight matrix B in interleaved K-split! + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements (1024B) + // - YTILE represents the number of column being serviced + // by wave + // - Loop for fetching weight matrix (B) are unrolled + // + // Fetch activation matrix A from LDS + // - Loop for fetching activation matrix (A) are unrolled + // + // Finally, do the matrix multiplication in an unrolled + // fashion. This provides lot of food for compiler + // scheduling. + // + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + // Fetch the weight matrix from memory! 
+ #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const scalar_t* B_ = &B[(m + 0) * K + k_]; + bigB[0][k2].h8 = (loadnt((scalar8*)(&B_[0 * K]))); + //---------------------------------------------------- + // The following code with YTILE > 1 has to be deleted + //---------------------------------------------------- + if constexpr (YTILE >= 2) + bigB[1][k2].h8 = (loadnt((scalar8*)(&B_[1 * K]))); + if constexpr (YTILE >= 3) + bigB[2][k2].h8 = (loadnt((scalar8*)(&B_[2 * K]))); + if constexpr (YTILE >= 4) + bigB[3][k2].h8 = (loadnt((scalar8*)(&B_[3 * K]))); + if constexpr (YTILE >= 5) + bigB[4][k2].h8 = (loadnt((scalar8*)(&B_[4 * K]))); + if constexpr (YTILE >= 6) + bigB[5][k2].h8 = (loadnt((scalar8*)(&B_[5 * K]))); + if constexpr (YTILE >= 7) + bigB[6][k2].h8 = (loadnt((scalar8*)(&B_[6 * K]))); + if constexpr (YTILE >= 8) + bigB[7][k2].h8 = (loadnt((scalar8*)(&B_[7 * K]))); + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + // Fetch A activation matrix in interleaved fashion from LDS or memory + + for (int n = 0; n < N; n++) { + if (k_ + K * n < 32 * 1024) + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + else + bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t n = 0; n < N; n++) { + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + // Do the matrix multiplication of activation and weight matrix + // - Remember the accumulation is happening for K-split of 64! 
+ #pragma unroll + for (uint32_t b = 0; b < A_CHUNK / 2; b++) { + DOT2C(sum[n][0], bigA[n][k2].f[b], bigB[0][k2].f[b]); + //---------------------------------------------------- + // The following code with YTILE > 1 + //---------------------------------------------------- + if constexpr (YTILE >= 2) { + DOT2C(sum[n][1], bigA[n][k2].f[b], bigB[1][k2].f[b]); + } + if constexpr (YTILE >= 3) { + DOT2C(sum[n][2], bigA[n][k2].f[b], bigB[2][k2].f[b]); + } + if constexpr (YTILE >= 4) { + DOT2C(sum[n][3], bigA[n][k2].f[b], bigB[3][k2].f[b]); + } + if constexpr (YTILE >= 5) { + DOT2C(sum[n][4], bigA[n][k2].f[b], bigB[4][k2].f[b]); + } + if constexpr (YTILE >= 6) { + DOT2C(sum[n][5], bigA[n][k2].f[b], bigB[5][k2].f[b]); + } + if constexpr (YTILE >= 7) { + DOT2C(sum[n][6], bigA[n][k2].f[b], bigB[6][k2].f[b]); + } + if constexpr (YTILE >= 8) { + DOT2C(sum[n][7], bigA[n][k2].f[b], bigB[7][k2].f[b]); + } + } + } + } + } + + //---------------------------------------------------- + // Final reduction step using shuffle + //---------------------------------------------------- + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + } + } + + if (threadIdx.x == 
63) { + for (int n = 0; n < N; n++) { + for (int i = 0; i < YTILE; i++) { + if (commitColumn[i]) + C[m + i + n * M] = __float2s(sum[n][i]); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! + if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + } +} + +#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +template +__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support + +#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +// This version targets big A[] cases, where it is much larger than LDS capacity +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float; + + union bigType { + scalar_t h[A_CHUNK]; + float f[A_CHUNK / 2]; + float2 f2[A_CHUNK / 4]; + double d[A_CHUNK / 4]; + scalar8 h8; + }; + + //---------------------------------------------------- + // Reserving 64 KB of LDS to have 1 WG / CU + // Goal is to bring the activation matrix A to the LDS + // and use it across the lifetime of the work group + // TODO: When activation matrix is larger than 64 KB + // then this is not goint to work! + //---------------------------------------------------- + __shared__ scalar_t s[1024 * 32]; + + //---------------------------------------------------- + // Computation of columns that need to be committed to memory! 
+ //---------------------------------------------------- + uint32_t commitColumn[YTILE]; + for (uint32_t i = 0; i < YTILE; i++) { + commitColumn[i] = 1; + } + + // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp); + if (threadIdx.y >= _WvPrGrp) return; + + //---------------------------------------------------- + // Indexing function into the column of weight matrix B + // Algorithm does 64 lane k-splitting / wave and uses + // WG ID and Thread ID to find the index. + //---------------------------------------------------- + uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! + if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + + //---------------------------------------------------- + // Fetch the activation matrix to LDS + // Loop iteration: + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements + // - Each WG will fetch 512 * 16 => 8K elements + // - Then the WG will move to another 8 K elements + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + #define PCML + #ifndef PCML + for (uint32_t k = 0; k < min(K * N, 32 * 1024); + k += THRDS * WvPrGrp * A_CHUNK) { + uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + + if (k_in >= min(K * N, 32 * 1024)) break; + + *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); + } + __syncthreads(); + #endif + + #define TUC (THRDS * UNRL * A_CHUNK) + uint32_t kBase = 0; + // find biggest k size that fits in LDS + uint32_t kFit = (32 * 1024) / N; + // kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple + // of TUC + kFit = (kFit % TUC == 0) + ? 
kFit + : (kFit - kFit % TUC); // round up to multiple of TUC + // if (kFit == 0) kFit = TUC; + kFit = min(kFit, K); + + float sum[N][YTILE]; + + //---------------------------------------------------- + // Each wave works on a single column of weight matrix. + // There are 16 waves per WG, and hence, each WG is + // working on 16 columns of weight matrix. Moreover, + // we tile in column direction by YTILE, so when YTILE=1 + // the above math is right, however, when YTILE=2 then + // each wave will be working on 2 columns and WG will + // be working on 32 columns. + // + // Top level loop that makes WGs persistent! + // - WGs iterates across columns of weight matrix + // - Each wave within WG works on a given column(s) + // - After completing first set of columns, WGs start + // working on the next set of available columns + //---------------------------------------------------- + #ifdef PCML + int YW = (YTILE * _WvPrGrp); + uint32_t Mrndp = (M % YW == 0) ? M : (M - M % YW + YW); + while (m < Mrndp) { + #else + while (m < M) { + #endif + //---------------------------------------------------- + // 'sum' accumulates the matrix A x B computation + // split across 64 lanes. + // + // YTILE represents how many column of weight matrix + // are being worked on by each wave. + //---------------------------------------------------- + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = 0; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + //---------------------------------------------------- + // Fetch weight matrix B in interleaved K-split! 
+ // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements (1024B) + // - YTILE represents the number of column being serviced + // by wave + // - Loop for fetching weight matrix (B) are unrolled + // + // Fetch activation matrix A from LDS + // - Loop for fetching activation matrix (A) are unrolled + // + // Finally, do the matrix multiplication in an unrolled + // fashion. This provides lot of food for compiler + // scheduling. + // + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + #ifdef PCML + if ((k1 == 0) || (k1 == kBase + kFit)) { // load next chunk of A[] to LDS + if (k1 != 0) kBase += kFit; + __syncthreads(); + for (uint32_t k = 0; k < kFit; k += THRDS * _WvPrGrp * A_CHUNK) { + uint32_t kOff = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + if (kBase + kOff >= K) break; + if (kOff >= kFit) break; + for (uint32_t n = 0; n < N; n++) { + uint32_t k_in = kBase + n * K + kOff; + uint32_t k_ot = n * kFit + kOff; + *((bigType*)(&s[k_ot])) = *((bigType*)(&A[k_in])); + } + } + __syncthreads(); + } + if (m >= M) continue; + #endif + + // Fetch the weight matrix from memory! 
+ #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const scalar_t* B_ = &B[(m + 0) * K + k_]; + bigB[0][k2].h8 = (loadnt((scalar8*)(&B_[0 * K]))); + //---------------------------------------------------- + // The following code with YTILE > 1 has to be deleted + //---------------------------------------------------- + if constexpr (YTILE >= 2) + bigB[1][k2].h8 = (loadnt((scalar8*)(&B_[1 * K]))); + if constexpr (YTILE >= 3) + bigB[2][k2].h8 = (loadnt((scalar8*)(&B_[2 * K]))); + if constexpr (YTILE >= 4) + bigB[3][k2].h8 = (loadnt((scalar8*)(&B_[3 * K]))); + if constexpr (YTILE >= 5) + bigB[4][k2].h8 = (loadnt((scalar8*)(&B_[4 * K]))); + if constexpr (YTILE >= 6) + bigB[5][k2].h8 = (loadnt((scalar8*)(&B_[5 * K]))); + if constexpr (YTILE >= 7) + bigB[6][k2].h8 = (loadnt((scalar8*)(&B_[6 * K]))); + if constexpr (YTILE >= 8) + bigB[7][k2].h8 = (loadnt((scalar8*)(&B_[7 * K]))); + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + // Fetch A activation matrix in interleaved fashion from LDS or memory + + for (int n = 0; n < N; n++) { + #ifdef PCML + bigA[n][k2] = *((const bigType*)(&(s[k_ - kBase + kFit * n]))); + #else + if (k_ + K * n < 32 * 1024) + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + else + bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); + #endif + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + #pragma unroll + for (uint32_t n = 0; n < N; n++) { + // Do the matrix multiplication of activation and weight matrix + // - Remember the accumulation is 
happening for K-split of 64! + #pragma unroll + for (uint32_t b = 0; b < A_CHUNK / 2; b++) { + DOT2C(sum[n][0], bigA[n][k2].f[b], bigB[0][k2].f[b]); + //---------------------------------------------------- + // The following code with YTILE > 1 + //---------------------------------------------------- + if constexpr (YTILE >= 2) { + DOT2C(sum[n][1], bigA[n][k2].f[b], bigB[1][k2].f[b]); + } + if constexpr (YTILE >= 3) { + DOT2C(sum[n][2], bigA[n][k2].f[b], bigB[2][k2].f[b]); + } + if constexpr (YTILE >= 4) { + DOT2C(sum[n][3], bigA[n][k2].f[b], bigB[3][k2].f[b]); + } + if constexpr (YTILE >= 5) { + DOT2C(sum[n][4], bigA[n][k2].f[b], bigB[4][k2].f[b]); + } + if constexpr (YTILE >= 6) { + DOT2C(sum[n][5], bigA[n][k2].f[b], bigB[5][k2].f[b]); + } + if constexpr (YTILE >= 7) { + DOT2C(sum[n][6], bigA[n][k2].f[b], bigB[6][k2].f[b]); + } + if constexpr (YTILE >= 8) { + DOT2C(sum[n][7], bigA[n][k2].f[b], bigB[7][k2].f[b]); + } + } + } + } + } + + #ifdef PCML + if (m >= M) { + m += CuCount * _WvPrGrp * YTILE; + kBase = 0; + continue; + } + #endif + + //---------------------------------------------------- + // Final reduction step using shuffle + //---------------------------------------------------- + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, 
%3 row_bcast:31 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + } + } + + if (threadIdx.x == 63) { + for (int n = 0; n < N; n++) { + for (int i = 0; i < YTILE; i++) { + if (commitColumn[i]) + C[m + i + n * M] = __float2s(sum[n][i]); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + kBase = 0; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! + if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + } +} +#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +template +__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support + +int mindiv(int N, int div1, int div2) { + int nPrRnd = div1 * div2; + int rnds0 = N / nPrRnd; + nPrRnd -= div1 * 3; + int rnds3 = N / nPrRnd; + nPrRnd -= div1; + int rnds4 = N / nPrRnd; + nPrRnd -= div1; + int rnds5 = N / nPrRnd; + nPrRnd -= div1; + int rnds6 = N / nPrRnd; + nPrRnd -= div1; + int rnds7 = N / nPrRnd; + nPrRnd -= div1; + int rnds8 = N / nPrRnd; + nPrRnd -= div1; + int rnds9 = N / nPrRnd; + nPrRnd -= div1; + int rtn = div2; + if (rnds0 == rnds3) rtn = div2 - 3; + if (rnds0 == rnds4) rtn = div2 - 4; + if (rnds0 == rnds5) rtn = div2 - 5; + if (rnds0 == rnds6) rtn = div2 - 6; + if (rnds0 == rnds7) rtn = div2 - 7; + if (rnds0 == rnds8) rtn = div2 - 8; + if (rnds0 == rnds9) rtn = div2 - 9; + return rtn; +} + +torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, + const int64_t CuCount) { + auto M_in = in_a.size(0); + auto K_in = in_a.size(1); + auto N_in = in_b.size(0); + + TORCH_CHECK(in_a.dtype() == in_b.dtype()); + TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0"); + TORCH_CHECK(in_a.dtype() == torch::kFloat16 || + in_a.dtype() == 
torch::kBFloat16); + + auto out_c = torch::empty( + {N_in, M_in}, + torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device())); + + dim3 grid(CuCount); + + const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + +#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ + _N) \ + { \ + dim3 block(64, _WvPrGrp); \ + if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ + wvSplitK_hf_sml_ \ + <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ + CuCount); \ + } else if (K_in * N_in <= 32 * 1024 * 1.2) { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ + wvSplitK_hf_ \ + <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ + CuCount); \ + } else { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \ + wvSplitK_hf_big_ \ + <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ + CuCount); \ + } \ + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] { + using fptype = typename scalar::type; + fptype* af4 = reinterpret_cast(in_a.data_ptr()); + const fptype* bf4 = reinterpret_cast(in_b.data_ptr()); + fptype* c = reinterpret_cast(out_c.data_ptr()); + switch (N_in) { + case 1: + WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1) + break; + case 2: + WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2) + break; + case 3: + WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3) + break; + case 4: + WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4) + break; + default: + throw std::runtime_error( + "Unsupported N value: " + std::to_string(M_in) + "," + + std::to_string(K_in) + "," + std::to_string(N_in)); + } + }); + return out_c; +} + +#if defined(__HIP__MI300__) // TODO: Add NAVI support +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const fp8_t* B, + const fp8_t* __restrict__ A, scalar_t* C, + const float* __restrict__ s_A, + const float* __restrict__ s_B, const int _WvPrGrp, + 
const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; + using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; + using intx4 = __attribute__((__vector_size__(4 * sizeof(int)))) int; + union bigType { + char f8[A_CHUNK]; + char2 c2[A_CHUNK / 2]; + scalar_t h[A_CHUNK / 2]; + float f[A_CHUNK / 4]; + int i[A_CHUNK / 4]; + long l[A_CHUNK / 8]; + intx4 l2[A_CHUNK / 16]; + scalar8 h8; + }; + + __shared__ fp8_t s[1024 * 64]; + + for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK; + k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) { + *((bigType*)(&s[k])) = *((bigType*)(&A[k])); + } + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; + + using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float; + floatx16 sum[N][YTILE]; + float sA = *s_A; + float sB = *s_B; + + while (m < M) { + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = {0.f}; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + #pragma unroll + for (uint32_t n = 0; n < N; ++n) bigA[n][k2].h8 = {0.f}; + #pragma unroll + for (uint32_t y = 0; y < YTILE; ++y) bigB[y][k2].h8 = {0.f}; + } + + // Fetch the weight matrix from memory! 
+ #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const fp8_t* B_ = &B[(m + 0) * Kp + k_]; + #pragma unroll + for (uint32_t y = 0; y < YTILE; ++y) { + bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * Kp]))); + } + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + for (int n = 0; n < N; n++) { + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + if (k >= K) break; + + for (uint32_t n = 0; n < N; n++) { + for (int i = 0; i < A_CHUNK; i += 8) { + for (int y = 0; y < YTILE; ++y) { + sum[n][y] = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( + bigA[n][k2].l[i / 8], bigB[y][k2].l[i / 8], sum[n][y], 0, 0, + 0); + } + } + } + } + } + + // Final reduction + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + float accm0 = sum[n][y][0]; + float accm16 = sum[n][y][8]; + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][1]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][9]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][2]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][10]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][3]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][11]), "v"(accm16)); + asm("v_add_f32 
%0, %2, %3 row_shl:8 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][4]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][12]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][5]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][13]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][6]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][14]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][7]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][15]), "v"(accm16)); + accm0 += __shfl(accm0, 36); + accm16 += __shfl(accm16, 52); + sum[n][y][0] = accm0 + __shfl(accm16, 16); + } + } + + if (threadIdx.x == 0) { + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + C[m + y + n * M] = __float2s(sum[n][y][0] * sA * sB); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + } +} +#else // !defined(__HIP__MI300__) TODO: Add NAVI support +template +__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, + const fp8_t* B, const fp8_t* __restrict__ A, + scalar_t* C, const float* __restrict__ s_A, + const float* __restrict__ s_B, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300__) TODO: Add NAVI support + +#if defined(__HIP__MI300__) // TODO: Add NAVI support +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitKQ_hf_(const int K, const int Kp, const int M, const fp8_t* B, + const fp8_t* __restrict__ A, scalar_t* C, + const float* __restrict__ s_A, const float* __restrict__ s_B, + const int _WvPrGrp, const int CuCount) { + 
using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; + using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; + using intx4 = __attribute__((__vector_size__(4 * sizeof(int)))) int; + union bigType { + char f8[A_CHUNK]; + char2 c2[A_CHUNK / 2]; + scalar_t h[A_CHUNK / 2]; + float f[A_CHUNK / 4]; + int i[A_CHUNK / 4]; + long l[A_CHUNK / 8]; + intx4 l2[A_CHUNK / 16]; + scalar8 h8; + }; + + __shared__ fp8_t s[1024 * 64]; + + for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK; + k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) { + *((bigType*)(&s[k])) = *((bigType*)(&A[k])); + } + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; + + using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float; + floatx16 sum[N][YTILE]; + float sA = *s_A; + float sB = *s_B; + + while (m < M) { + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = {0}; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + // Fetch the weight matrix from memory! + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const fp8_t* B_ = &B[(m + 0) * Kp + k_]; + for (int y = 0; y < YTILE; ++y) { + if (y + m >= M) break; // To avoid mem access fault. 
+ bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * Kp]))); + } + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + for (int n = 0; n < N; n++) { + if (k_ + K * n < 64 * 1024) + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + else + bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + for (uint32_t n = 0; n < N; n++) { + for (int i = 0; i < A_CHUNK; i += 8) { + for (int y = 0; y < YTILE; ++y) { + sum[n][y] = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( + bigA[n][k2].l[i / 8], bigB[y][k2].l[i / 8], sum[n][y], 0, 0, + 0); + } + } + } + } + } + + // Final reduction + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + float accm0 = sum[n][y][0]; + float accm16 = sum[n][y][8]; + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][1]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][9]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][2]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][10]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][3]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][11]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][4]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 
row_shl:8 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][12]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][5]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][13]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][6]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][14]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][7]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][15]), "v"(accm16)); + accm0 += __shfl(accm0, 36); + accm16 += __shfl(accm16, 52); + sum[n][y][0] = accm0 + __shfl(accm16, 16); + } + } + + if (threadIdx.x == 0) { + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + if (y + m >= M) break; // To avoid mem access fault. + C[m + y + n * M] = __float2s(sum[n][y][0] * sA * sB); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + } +} +#else // !defined(__HIP__MI300__) TODO: Add NAVI support +template +__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M, + const fp8_t* B, const fp8_t* __restrict__ A, + scalar_t* C, const float* __restrict__ s_A, + const float* __restrict__ s_B, const int _WvPrGrp, + const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300__) TODO: Add NAVI support + +void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, + at::Tensor& scale_a, at::Tensor& scale_b, + const int64_t CuCount) { + static c10::ScalarType kFp8Type = is_fp8_ocp() + ? 
c10::ScalarType::Float8_e4m3fn + : c10::ScalarType::Float8_e4m3fnuz; + auto M_in = in_a.size(0); + auto K_in = in_a.size(1); + auto N_in = in_b.size(0); + auto Kp_in = in_a.stride(0); + TORCH_CHECK(K_in % 16 == 0, "k % 16 == 0"); + TORCH_CHECK(in_a.dtype() == in_b.dtype() && in_a.dtype() == kFp8Type); + TORCH_CHECK(out_c.dtype() == torch::kFloat16 || + out_c.dtype() == torch::kBFloat16); + + dim3 grid(CuCount); + const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + +#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ + _N) \ + { \ + dim3 block(64, _WvPrGrp); \ + if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ + wvSplitKQ_hf_sml_ \ + <<>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ + s_a, s_b, __wvPrGrp, CuCount); \ + } else { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ + wvSplitKQ_hf_ \ + <<>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ + s_a, s_b, __wvPrGrp, CuCount); \ + } \ + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(out_c.scalar_type(), "wvSplitKQ", [&] { + using fptype = typename scalar::type; + auto c_ptr = reinterpret_cast(out_c.data_ptr()); + auto s_a = scale_a.data_ptr(); + auto s_b = scale_b.data_ptr(); + VLLM_DISPATCH_FP8_TYPES(in_a.scalar_type(), "wvSplitKQ", [&] { + auto a_ptr = in_a.data_ptr(); + auto b_ptr = in_b.data_ptr(); + switch (N_in) { + case 1: + WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 1) + break; + case 2: + WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 2) + break; + case 3: + WVSPLITKQ(16, 4, 7, 7, 1, 1, 1, 3) + break; + case 4: + WVSPLITKQ(16, 4, 7, 7, 1, 1, 1, 4) + break; + default: + throw std::runtime_error( + "Unsupported N value: " + std::to_string(M_in) + "," + + std::to_string(K_in) + "," + std::to_string(N_in)); + } + }); + }); +} \ No newline at end of file diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index 
a5d2e2f97a3ed..4ac6fd1e99408 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -14,6 +14,24 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { // vLLM custom ops for rocm + // Custom gemm op for matrix-vector multiplication + rocm_ops.def( + "LLMM1(Tensor in_a, Tensor in_b, int rows_per_block) -> " + "Tensor"); + rocm_ops.impl("LLMM1", torch::kCUDA, &LLMM1); + + // Custom gemm op for skinny matrix-matrix multiplication + rocm_ops.def( + "wvSplitK(Tensor in_a, Tensor in_b, int CuCount) -> " + "Tensor"); + rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK); + + // wvSplitK for fp8 + rocm_ops.def( + "wvSplitKQ(Tensor in_a, Tensor in_b, Tensor! out_c, Tensor scale_a, " + " Tensor scale_b, int CuCount) -> ()"); + rocm_ops.impl("wvSplitKQ", torch::kCUDA, &wvSplitKQ); + // Custom attention op // Compute the attention between an input query and the cached // keys/values using PagedAttention. @@ -23,7 +41,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { " Tensor query, Tensor key_cache," " Tensor value_cache, int num_kv_heads," " float scale, Tensor block_tables," - " Tensor context_lens, int block_size," + " Tensor context_lens," + " Tensor? query_start_loc," + " int block_size," " int max_context_len," " Tensor? alibi_slopes," " str kv_cache_dtype," diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index eb3a2c911d55e..b6ff6a006c028 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -31,6 +31,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("weak_ref_tensor(Tensor input) -> Tensor"); ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); + ops.def("get_cuda_view_from_cpu_tensor(Tensor cpu_tensor) -> Tensor"); + ops.impl("get_cuda_view_from_cpu_tensor", torch::kCPU, + &get_cuda_view_from_cpu_tensor); + // Attention ops // Compute the attention between an input query and the cached // keys/values using PagedAttention. 
@@ -60,6 +64,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " int blocksparse_head_sliding_step) -> ()"); ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2); +#ifndef USE_ROCM + // Merge attn states + // Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 + // can be used to combine partial attention results (in the split-KV case) + ops.def( + "merge_attn_states(" + " Tensor! output," + " Tensor!? output_lse," + " Tensor prefix_output," + " Tensor prefix_lse," + " Tensor suffix_output," + " Tensor suffix_lse) -> ()"); + ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states); +#endif + // Activation ops // Activation function used in SwiGLU. ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); @@ -291,7 +310,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { #endif // Dequantization for GGML. - ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor"); + ops.def( + "ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? " + "dtype) -> Tensor"); ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize); // mmvq kernel for GGML. @@ -365,6 +386,35 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool"); ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8); + // Check if cutlass grouped gemm is supported for CUDA devices of the given + // capability + ops.def("cutlass_group_gemm_supported(int cuda_device_capability) -> bool"); + ops.impl("cutlass_group_gemm_supported", &cutlass_group_gemm_supported); + + // CUTLASS w8a8 grouped GEMM + ops.def( + "cutlass_moe_mm(Tensor! 
out_tensors, Tensor a_tensors, Tensor b_tensors, " + " Tensor a_scales, Tensor b_scales, Tensor expert_offsets, " + " Tensor problem_sizes, Tensor a_strides, " + " Tensor b_strides, Tensor c_strides) -> ()", + {stride_tag}); + ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm); + + // A function that computes data required to run fused MoE with w8a8 grouped + // GEMM. It takes topk_ids as an input, and computes expert_offsets + // (token start indices of each expert). In addition to this, it computes + // problem sizes for each expert's multiplication used by the two mms called + // from fused MoE operation, and arrays with permutations required to shuffle + // and de-shuffle the input/output of the fused operation. + ops.def( + "get_cutlass_moe_mm_data(Tensor topk_ids, Tensor! expert_offsets, " + " Tensor! problem_sizes1, Tensor! problem_sizes2, " + " Tensor! input_permutation, " + " Tensor! output_permutation, int num_experts, " + " int n, int k) -> ()", + {stride_tag}); + ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); + // Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3) ops.def( "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> " @@ -581,12 +631,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { &get_max_shared_memory_per_block_device_attribute); } -#ifndef USE_ROCM TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { // Custom all-reduce kernels custom_ar.def( "init_custom_ar(int[] ipc_tensors, Tensor rank_data, " - "int rank, bool full_nvlink) -> int"); + "int rank, bool fully_connected) -> int"); custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar); custom_ar.def( "all_reduce(int fa, Tensor inp, Tensor! 
out, int reg_buffer, " @@ -599,7 +648,13 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { custom_ar.def("register_buffer", ®ister_buffer); custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); custom_ar.def("register_graph_buffers", ®ister_graph_buffers); + + custom_ar.def("allocate_shared_buffer_and_handle", + &allocate_shared_buffer_and_handle); + custom_ar.def("open_mem_handle(Tensor mem_handle) -> int", &open_mem_handle); + custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle); + + custom_ar.def("free_shared_buffer", &free_shared_buffer); } -#endif REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/Dockerfile b/docker/Dockerfile similarity index 99% rename from Dockerfile rename to docker/Dockerfile index d1ecef586d50b..e8e18df1bb496 100644 --- a/Dockerfile +++ b/docker/Dockerfile @@ -240,6 +240,8 @@ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ fi COPY examples examples +COPY benchmarks benchmarks +COPY ./vllm/collect_env.py . # Although we build Flashinfer with AOT mode, there's still # some issues w.r.t. JIT compilation. Therefore we need to diff --git a/Dockerfile.arm b/docker/Dockerfile.arm similarity index 100% rename from Dockerfile.arm rename to docker/Dockerfile.arm diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu new file mode 100644 index 0000000000000..c647d9036f400 --- /dev/null +++ b/docker/Dockerfile.cpu @@ -0,0 +1,142 @@ +# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 
+# +# Build targets: +# vllm-openai (default): used for serving deployment +# vllm-test: used for CI tests +# vllm-dev: used for development +# +# Build arguments: +# PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9 +# VLLM_CPU_DISABLE_AVX512=false (default)|true +# + +######################### BASE IMAGE ######################### +FROM ubuntu:22.04 AS base + +WORKDIR /workspace/ + +ARG PYTHON_VERSION=3.12 +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" + +ENV LD_PRELOAD="" + +# Install minimal dependencies and uv +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt-get update -y \ + && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \ + gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ + && curl -LsSf https://astral.sh/uv/install.sh | sh + +ENV CCACHE_DIR=/root/.cache/ccache +ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache + +ENV PATH="/root/.local/bin:$PATH" +ENV VIRTUAL_ENV="/opt/venv" +ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python +RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +ENV UV_HTTP_TIMEOUT=500 + +# Install Python dependencies +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_INDEX_STRATEGY="unsafe-best-match" +ENV UV_LINK_MODE="copy" +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + uv pip install --upgrade pip && \ + uv pip install -r requirements/cpu.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0 + +ENV 
LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD" + +RUN echo 'ulimit -c 0' >> ~/.bashrc + +######################### BUILD IMAGE ######################### +FROM base AS vllm-build + +ARG GIT_REPO_CHECK=0 +# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... +ARG VLLM_CPU_DISABLE_AVX512 +ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} + +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ + uv pip install -r requirements/build.txt + +COPY . . +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,source=.git,target=.git \ + VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel + +######################### DEV IMAGE ######################### +FROM vllm-build AS vllm-dev + +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt-get install -y --no-install-recommends vim numactl + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -e tests/vllm_test_utils + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,source=.git,target=.git \ + VLLM_TARGET_DEVICE=cpu python3 setup.py develop + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -r requirements/dev.txt && \ + pre-commit install --hook-type pre-commit --hook-type commit-msg + +ENTRYPOINT ["bash"] + +######################### TEST IMAGE ######################### +FROM base AS vllm-test + +WORKDIR /workspace/ + +RUN --mount=type=cache,target=/root/.cache/uv \ + 
--mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \ + uv pip install -r requirements/test.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ + uv pip install dist/*.whl + +ADD ./tests/ ./tests/ +ADD ./examples/ ./examples/ +ADD ./benchmarks/ ./benchmarks/ +ADD ./vllm/collect_env.py . + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -e tests/vllm_test_utils + +ENTRYPOINT ["bash"] + +######################### RELEASE IMAGE ######################### +FROM base AS vllm-openai + +WORKDIR /workspace/ + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ + uv pip install dist/*.whl + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.hpu b/docker/Dockerfile.hpu similarity index 84% rename from Dockerfile.hpu rename to docker/Dockerfile.hpu index 48211c88f872b..224f142b5ff44 100644 --- a/Dockerfile.hpu +++ b/docker/Dockerfile.hpu @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest +FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest COPY ./ /workspace/vllm diff --git a/Dockerfile.neuron b/docker/Dockerfile.neuron similarity index 90% rename from Dockerfile.neuron rename to docker/Dockerfile.neuron index 067645906366e..2b63fe301bac6 100644 --- a/Dockerfile.neuron +++ b/docker/Dockerfile.neuron @@ -1,6 +1,6 @@ # default base image # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx -ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04" +ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04" FROM $BASE_IMAGE @@ -21,9 +21,9 @@ VOLUME [ 
${APP_MOUNT} ] WORKDIR ${APP_MOUNT}/vllm RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas -RUN python3 -m pip install sentencepiece transformers==4.45.2 -U -RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U +RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity +RUN python3 -m pip install sentencepiece transformers==4.48.0 -U +RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install pytest # uninstall transformers-neuronx package explicitly to avoid version conflict diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch new file mode 100644 index 0000000000000..0063712e47818 --- /dev/null +++ b/docker/Dockerfile.nightly_torch @@ -0,0 +1,307 @@ +# The vLLM Dockerfile is used to construct vLLM image against torch nightly that can be directly used for testing + +# for torch nightly, cuda >=12.6 is required, +# use 12.8 due to FlashAttention issue with cuda 12.6 (https://github.com/vllm-project/vllm/issues/15435#issuecomment-2775924628) +ARG CUDA_VERSION=12.8.0 +# +#################### BASE BUILD IMAGE #################### +# prepare basic build environment +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base +ARG CUDA_VERSION=12.8.0 +ARG PYTHON_VERSION=3.12 +ARG TARGETPLATFORM +ENV DEBIAN_FRONTEND=noninteractive +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl sudo \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives 
--install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version \ + && python3 -m pip --version +# Install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 +# as it was causing spam when compiling the CUTLASS kernels +RUN apt-get install -y gcc-10 g++-10 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 +RUN < torch_build_versions.txt +RUN cat torch_build_versions.txt + +# cuda arch list used by torch +# can be useful for `test` +# explicitly set the list to avoid issues with torch 2.2 +# see https://github.com/pytorch/pytorch/pull/123243 + +# Override the arch list for flash-attn to reduce the binary size +ARG vllm_fa_cmake_gpu_arches='80-real;90-real' +ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} +#################### BASE BUILD IMAGE #################### + +#################### WHEEL BUILD IMAGE #################### +FROM base AS build +ARG TARGETPLATFORM + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +COPY . . 
+ +RUN python3 use_existing_torch.py + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt + +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi + +# Max jobs used by Ninja to build extensions +ARG max_jobs=16 +ENV MAX_JOBS=${max_jobs} +ARG nvcc_threads=2 +ENV NVCC_THREADS=$nvcc_threads + +ARG USE_SCCACHE +ARG SCCACHE_BUCKET_NAME=vllm-build-sccache +ARG SCCACHE_REGION_NAME=us-west-2 +ARG SCCACHE_S3_NO_CREDENTIALS=0 + +# if USE_SCCACHE is set, use sccache to speed up compilation +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" = "1" ]; then \ + echo "Installing sccache..." \ + && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ + && tar -xzf sccache.tar.gz \ + && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ + && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ + && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ + && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ + && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ + && export SCCACHE_IDLE_TIMEOUT=0 \ + && export CMAKE_BUILD_TYPE=Release \ + && sccache --show-stats \ + && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ + && sccache --show-stats; \ + fi + +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" != "1" ]; then \ + # Clean any existing CMake artifacts + rm -rf .deps && \ + mkdir -p .deps && \ + python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ + fi + +#################### WHEEL BUILD IMAGE #################### + +################### VLLM INSTALLED IMAGE 
#################### +# Setup clean environment for vLLM and its dependencies for test and api server using ubuntu22.04 with AOT flashinfer +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base +# prepare for environment starts +ARG CUDA_VERSION=12.8.0 +ARG PYTHON_VERSION=3.12 +WORKDIR /vllm-workspace +ENV DEBIAN_FRONTEND=noninteractive +ARG TARGETPLATFORM + +RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ + echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment + +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. 
+RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ + +# get the nightly torch version used in the build to make sure the version is the same +COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128 + +# install the vllm wheel +RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system vllm-dist/*.whl --verbose + +# install xformers again for the new environment +RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose + +ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' + +# install package for build flashinfer +# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738 +RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1 + + +# build flashinfer for torch nightly from source around 10 mins +# release version: v0.2.2.post1 +# todo(elainewy): cache flashinfer build result for faster build +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + echo "git clone flashinfer..." \ + && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \ + && cd flashinfer \ + && git checkout v0.2.2.post1 \ + && git submodule update --init --recursive \ + && echo "finish git clone flashinfer..." \ + && rm -rf build \ + && export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \ + && FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \ + && cd .. 
\ + && rm -rf flashinfer + +# install flashinfer +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system flashinfer-dist/*.whl --verbose + +# install common packages +COPY requirements/common.txt requirements/common.txt +COPY use_existing_torch.py use_existing_torch.py +COPY pyproject.toml pyproject.toml + +COPY examples examples +COPY benchmarks benchmarks +COPY ./vllm/collect_env.py . + +RUN python3 use_existing_torch.py +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/common.txt + +################### VLLM INSTALLED IMAGE #################### + + +#################### UNITTEST IMAGE ############################# +FROM vllm-base as test +COPY tests/ tests/ + +# install build and runtime dependencies without stable torch version +COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -e tests/vllm_test_utils + +# enable fast downloads from hf (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system hf_transfer +ENV HF_HUB_ENABLE_HF_TRANSFER 1 + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/nightly_torch_test.txt + +#################### UNITTEST IMAGE ############################# + diff --git a/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le similarity index 93% rename from Dockerfile.ppc64le rename to docker/Dockerfile.ppc64le index 913c289adc01e..ec979227871c6 100644 --- a/Dockerfile.ppc64le +++ b/docker/Dockerfile.ppc64le @@ -38,7 +38,7 @@ RUN microdnf install -y openssl-devel dnf \ && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \ && python${PYTHON_VERSION} -m venv 
${VIRTUAL_ENV} \ && python -m pip install -U pip uv \ - && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \ + && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \ && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \ && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ && cd /tmp && touch control @@ -126,13 +126,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \ FROM base-builder AS cv-builder ARG MAX_JOBS -ARG OPENCV_VERSION=84 +ARG OPENCV_VERSION=86 +# patch for version 4.11.0.86 +ARG OPENCV_PATCH=97f3f39 ARG ENABLE_HEADLESS=1 RUN --mount=type=cache,target=/root/.cache/uv \ source /opt/rh/gcc-toolset-13/enable && \ git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \ cd opencv-python && \ - sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \ + sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \ + cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. 
&& \ python -m build --wheel --installer=uv --outdir /opencvwheels/ ############################################################### @@ -148,9 +151,15 @@ COPY --from=arrow-builder /tmp/control /dev/null COPY --from=cv-builder /tmp/control /dev/null ARG VLLM_TARGET_DEVICE=cpu +ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 # this step installs vllm and populates uv cache # with all the transitive dependencies +RUN --mount=type=cache,target=/root/.cache/uv \ + source /opt/rh/gcc-toolset-13/enable && \ + git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \ + uv pip install maturin && \ + uv build --wheel --out-dir /hf_wheels/ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ @@ -159,7 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ source /opt/rh/gcc-toolset-13/enable && \ uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \ sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \ - uv pip install pandas pythran pybind11 && \ + uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \ # sentencepiece.pc is in some pkgconfig inside uv cache export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \ uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \ @@ -238,7 +247,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && python -m pip install -U pip uv --no-cache \ && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \ && make -C /numactl install \ - && uv pip install cmake \ + && uv pip install 'cmake<4' \ && cmake --install /lapack/build \ && uv pip uninstall cmake @@ -247,8 +256,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \ 
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ + --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \ --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \ - HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl + HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl COPY ./ /workspace/vllm WORKDIR /workspace/vllm diff --git a/Dockerfile.rocm b/docker/Dockerfile.rocm similarity index 98% rename from Dockerfile.rocm rename to docker/Dockerfile.rocm index 841e7978a424f..f9ebb10ca8731 100644 --- a/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -12,7 +12,8 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} # Install some basic utilities RUN apt-get update -q -y && apt-get install -q -y \ - sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev + sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ + apt-transport-https ca-certificates wget curl # Remove sccache RUN python3 -m pip install --upgrade pip && pip install setuptools_scm RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" diff --git a/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base similarity index 84% rename from Dockerfile.rocm_base rename to docker/Dockerfile.rocm_base index 38d6a33636eba..1776b26d445ce 100644 --- a/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,18 +1,18 @@ ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete -ARG HIPBLASLT_BRANCH="4d40e36" +ARG HIPBLASLT_BRANCH="db8e93b4" ARG HIPBLAS_COMMON_BRANCH="7c1566b" ARG LEGACY_HIPBLASLT_OPTION= ARG RCCL_BRANCH="648a58d" ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG 
TRITON_BRANCH="e5be006" ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="3a585126" -ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_BRANCH="295f2ed4" +ARG PYTORCH_VISION_BRANCH="v0.21.0" ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" -ARG FA_BRANCH="b7d29fb" -ARG FA_REPO="https://github.com/ROCm/flash-attention.git" -ARG AITER_BRANCH="21d47a9" +ARG FA_BRANCH="1a7f4dfa" +ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" +ARG AITER_BRANCH="7e1ed08" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base ENV PATH=/opt/rocm/llvm/bin:$PATH ENV ROCM_PATH=/opt/rocm ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: -ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942 +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ARG PYTHON_VERSION=3.12 @@ -31,7 +31,7 @@ ENV DEBIAN_FRONTEND=noninteractive # Install Python and other dependencies RUN apt-get update -y \ - && apt-get install -y software-properties-common git curl sudo vim less \ + && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ @@ -42,7 +42,7 @@ RUN apt-get update -y \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version -RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython +RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython FROM base AS build_hipblaslt ARG HIPBLASLT_BRANCH @@ -60,7 +60,8 @@ RUN cd hipBLAS-common \ RUN git clone https://github.com/ROCm/hipBLASLt RUN cd hipBLASLt \ && git checkout ${HIPBLASLT_BRANCH} \ - && ./install.sh -d --architecture 
${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ + && apt-get install -y llvm-dev \ + && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ && cd build/release \ && make package RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install @@ -110,11 +111,24 @@ RUN git clone ${FA_REPO} RUN cd flash-attention \ && git checkout ${FA_BRANCH} \ && git submodule update --init \ - && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist + && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \ && cp /app/flash-attention/dist/*.whl /app/install +FROM base AS build_aiter +ARG AITER_BRANCH +ARG AITER_REPO +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN git clone --recursive ${AITER_REPO} +RUN cd aiter \ + && git checkout ${AITER_BRANCH} \ + && git submodule update --init --recursive \ + && pip install -r requirements.txt +RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl +RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install + FROM base AS final RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ dpkg -i /install/*deb \ @@ -130,19 +144,12 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ pip install /install/*.whl RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ pip install /install/*.whl - -ARG AITER_REPO -ARG AITER_BRANCH -RUN git clone --recursive ${AITER_REPO} -RUN cd aiter \ - && git checkout ${AITER_BRANCH} \ - && git submodule update --init --recursive \ - && pip install -r requirements.txt \ - && PREBUILD_KERNELS=1 
GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter +RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ + pip install /install/*.whl ARG BASE_IMAGE -ARG HIPBLASLT_BRANCH ARG HIPBLAS_COMMON_BRANCH +ARG HIPBLASLT_BRANCH ARG LEGACY_HIPBLASLT_OPTION ARG RCCL_BRANCH ARG RCCL_REPO @@ -154,6 +161,8 @@ ARG PYTORCH_REPO ARG PYTORCH_VISION_REPO ARG FA_BRANCH ARG FA_REPO +ARG AITER_BRANCH +ARG AITER_REPO RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ @@ -167,6 +176,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ - && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt diff --git a/Dockerfile.s390x b/docker/Dockerfile.s390x similarity index 85% rename from Dockerfile.s390x rename to docker/Dockerfile.s390x index 5a84dc12d8f71..128929ac33311 100644 --- a/Dockerfile.s390x +++ b/docker/Dockerfile.s390x @@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ cd ../../python && \ export PYARROW_PARALLEL=4 && \ export ARROW_BUILD_TYPE=release && \ - uv pip install -r requirements/build.txt && \ + uv pip install -r requirements-build.txt && \ python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel FROM python-install AS numa-build @@ -96,6 +96,22 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ python setup.py bdist_wheel +FROM python-install AS hf-xet-builder +# Install hf-xet +WORKDIR /tmp +ENV 
CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + git clone https://github.com/huggingface/xet-core.git && \ + cd xet-core/hf_xet/ && \ + uv pip install maturin patchelf && \ + python -m maturin build --release --out dist && \ + mkdir -p /tmp/hf-xet/dist && \ + cp dist/*.whl /tmp/hf-xet/dist/ + # Final build stage FROM python-install AS vllm-cpu ARG PYTHON_VERSION @@ -120,12 +136,15 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ + --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \ sed -i '/^torch/d' requirements/build.txt && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ + HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \ uv pip install -v \ $ARROW_WHL_FILE \ $VISION_WHL_FILE \ + $HF_XET_WHL_FILE \ --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ --index-strategy unsafe-best-match \ -r requirements/build.txt \ @@ -149,4 +168,5 @@ USER 2000 WORKDIR /home/vllm # Set the default entrypoint -ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file +ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] + diff --git a/Dockerfile.tpu b/docker/Dockerfile.tpu similarity index 100% rename from Dockerfile.tpu rename to docker/Dockerfile.tpu diff --git a/Dockerfile.xpu b/docker/Dockerfile.xpu similarity index 100% rename from Dockerfile.xpu rename to 
docker/Dockerfile.xpu diff --git a/docs/README.md b/docs/README.md index 74e05ce02636b..dcd5e759dfa88 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,19 +2,42 @@ ## Build the docs -```bash -# Install dependencies. -pip install -r ../requirements/docs.txt +- Make sure in `docs` directory -# Build the docs. +```bash +cd docs +``` + +- Install the dependencies: + +```bash +pip install -r ../requirements/docs.txt +``` + +- Clean the previous build (optional but recommended): + +```bash make clean +``` + +- Generate the HTML documentation: + +```bash make html ``` ## Open the docs with your browser +- Serve the documentation locally: + ```bash python -m http.server -d build/html/ ``` -Launch your browser and open localhost:8000. +This will start a local server at http://localhost:8000. You can now open your browser and view the documentation. + +If port 8000 is already in use, you can specify a different port, for example: + +```bash +python -m http.server 3000 -d build/html/ +``` diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index be0b2a388e404..58bc2ebb9614b 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -10,8 +10,8 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 
script.setAttribute("runllm-name", "vLLM"); script.setAttribute("runllm-position", "BOTTOM_RIGHT"); - script.setAttribute("runllm-position-y", "20%"); - script.setAttribute("runllm-position-x", "3%"); + script.setAttribute("runllm-position-y", "120px"); + script.setAttribute("runllm-position-x", "20px"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; diff --git a/docs/source/assets/deployment/open_webui.png b/docs/source/assets/deployment/open_webui.png new file mode 100644 index 0000000000000..fe9a7e15ea71d Binary files /dev/null and b/docs/source/assets/deployment/open_webui.png differ diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-1.png b/docs/source/assets/design/v1/prefix_caching/example-time-1.png index 8849ca0237c39..d5a165ff6944b 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-1.png and b/docs/source/assets/design/v1/prefix_caching/example-time-1.png differ diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-3.png b/docs/source/assets/design/v1/prefix_caching/example-time-3.png index 71b9e9b60ab9a..d753a406bdb9a 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-3.png and b/docs/source/assets/design/v1/prefix_caching/example-time-3.png differ diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-4.png b/docs/source/assets/design/v1/prefix_caching/example-time-4.png index 017df1657c22e..d463248a3b1e9 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-4.png and b/docs/source/assets/design/v1/prefix_caching/example-time-4.png differ diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-5.png b/docs/source/assets/design/v1/prefix_caching/example-time-5.png index b80dd5b9949dc..231ebc6199faf 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-5.png and b/docs/source/assets/design/v1/prefix_caching/example-time-5.png differ diff --git 
a/docs/source/assets/design/v1/prefix_caching/example-time-6.png b/docs/source/assets/design/v1/prefix_caching/example-time-6.png index fbd7138596e89..6ded9170e8e83 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-6.png and b/docs/source/assets/design/v1/prefix_caching/example-time-6.png differ diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-7.png b/docs/source/assets/design/v1/prefix_caching/example-time-7.png index fc33ef50d4fdb..0b536de5a53f2 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-7.png and b/docs/source/assets/design/v1/prefix_caching/example-time-7.png differ diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md index efb4f692972b5..085918bed2b09 100644 --- a/docs/source/community/meetups.md +++ b/docs/source/community/meetups.md @@ -4,6 +4,9 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). +- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). +- [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). - [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. 
[[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0) - [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing) diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index fb93e65673dff..b8a1ddbe38794 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -22,6 +22,7 @@ Compute Resources: - Databricks - DeepInfra - Google Cloud +- Intel - Lambda Lab - Nebius - Novita AI diff --git a/docs/source/conf.py b/docs/source/conf.py index b02b84826c9f2..a83ad764125c5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -103,6 +103,11 @@ myst_url_schemes = { "title": "Pull Request #{{path}}", "classes": ["github"], }, + "gh-project": { + "url": "https://github.com/orgs/vllm-project/projects/{{path}}", + "title": "Project #{{path}}", + "classes": ["github"], + }, "gh-dir": { "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}", "title": "{{path}}", diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index 96674805df534..90b9a33cfbe62 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -1,6 +1,6 @@ # Dockerfile -We provide a to construct the image for running an OpenAI compatible server with vLLM. +We provide a to construct the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found [here](#deployment-docker). Below is a visual representation of the multi-stage Dockerfile. 
The build graph contains the following nodes: @@ -28,7 +28,7 @@ The edges of the build graph represent: > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present): > > ```bash - > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile + > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile > ``` > > or in case you want to run it directly with the docker image: @@ -43,7 +43,7 @@ The edges of the build graph represent: > --output png \ > --dpi 200 \ > --max-label-length 50 \ - > --filename Dockerfile \ + > --filename docker/Dockerfile \ > --legend > ``` > diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index 9cbfc32991f09..b42536f054d76 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -79,6 +79,17 @@ Further update the model as follows: return inputs_embeds ``` +- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model. + + ```python + class YourModelForImage2Seq(nn.Module): + ... + + def get_language_model(self) -> torch.nn.Module: + # Change `language_model` according to your implementation. + return self.language_model + ``` + - Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. ```diff @@ -110,17 +121,19 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": 1} ``` -### Maximum number of placeholder feature tokens +## 3. Specify dummy inputs -Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item` -to return the maximum number of placeholder feature tokens per input item for each modality. 
+Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for +HF processing as well as memory profiling. -When calling the model, the output embeddings from the visual encoder are assigned to the input positions -containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal -to the size of the output embeddings. +### For memory profiling -:::::{tab-set} -::::{tab-item} Basic example: LLaVA +Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it. + +Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens. + +::::{tab-set} +:::{tab-item} Basic example: LLaVA :sync: llava Looking at the code of HF's `LlavaForConditionalGeneration`: @@ -229,41 +242,50 @@ def get_num_image_tokens( ``` Notice that the number of image tokens doesn't depend on the image width and height. -So, we can calculate the maximum number of image tokens using any image size: +We can simply use a dummy `image_size` to calculate the multimodal profiling data: ```python +# NOTE: In actuality, this is usually implemented as part of the +# model's subclass of `BaseProcessingInfo`, but we show it as is +# here for simplicity. 
def get_image_size_with_most_features(self) -> ImageSize: hf_config = self.get_hf_config() width = height = hf_config.image_size return ImageSize(width=width, height=height) -def get_max_image_tokens(self) -> int: - target_width, target_height = self.get_image_size_with_most_features() - - return self.get_num_image_tokens( - image_width=target_width, - image_height=target_height, - ) -``` - -And thus, we can override the method as: - -```python -def get_mm_max_tokens_per_item( +def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], -) -> Mapping[str, int]: - return {"image": self.get_max_image_tokens()} +) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } +``` + +For the text, we simply expand the multimodal image token from the model config to match the desired number of images. + +```python +def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + return image_token * num_images ``` -:::{note} -Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP. 
::: -:::: - -::::{tab-item} Non-consecutive feature tokens: Fuyu +:::{tab-item} No input placeholders: Fuyu :sync: fuyu Looking at the code of HF's `FuyuForCausalLM`: @@ -383,213 +405,42 @@ num_patches_per_dim_w = image_width // patch_width num_patches = num_patches_per_dim_h * num_patches_per_dim_w ``` -We can calculate this in vLLM using this code: - -```python -def get_num_image_patches( - self, - *, - image_width: int, - image_height: int, -) -> int: - image_processor = self.get_image_processor() - target_width = image_processor.size["width"] - target_height = image_processor.size["height"] - patch_width = image_processor.patch_size["width"] - patch_height = image_processor.patch_size["height"] - - if not (image_width <= target_width and image_height <= target_height): - height_scale_factor = target_height / image_height - width_scale_factor = target_width / image_width - optimal_scale_factor = min(height_scale_factor, width_scale_factor) - - image_height = int(image_height * optimal_scale_factor) - image_width = int(image_width * optimal_scale_factor) - - ncols = math.ceil(image_width / patch_width) - nrows = math.ceil(image_height / patch_height) - return ncols * nrows -``` - -These image patches correspond to placeholder tokens (`|SPEAKER|`). However, the processor also -inserts newline tokens (`|NEWLINE|`) as shown here: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L654-L670 -tensor_of_image_ids = torch.full( - [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device -) -patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0) -assert num_patches == patches.shape[0] - -if variable_sized: - # Now terminate each line with |NEWLINE|. 
- tensor_of_image_ids = tensor_of_image_ids.reshape(-1, image_width // patch_width) - newline_ids = torch.full( - [tensor_of_image_ids.shape[0], 1], - image_newline_id, - dtype=torch.int32, - device=image_input.device, - ) - tensor_of_image_ids = torch.cat([tensor_of_image_ids, newline_ids], dim=1) - tensor_of_image_ids = tensor_of_image_ids.reshape(-1) -``` - -So, the layout of tokens for an image is: - -``` -|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -... -|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -``` - -This makes the placeholder tokens non-consecutive in the prompt. -Since vLLM requires the feature tokens to be consecutive, **we also treat the newline tokens as feature tokens**. - -So overall, the total number of feature tokens is - -```python -def get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, -) -> int: - image_processor = self.get_image_processor() - target_width = image_processor.size["width"] - target_height = image_processor.size["height"] - patch_width = image_processor.patch_size["width"] - patch_height = image_processor.patch_size["height"] - - if not (image_width <= target_width and image_height <= target_height): - height_scale_factor = target_height / image_height - width_scale_factor = target_width / image_width - optimal_scale_factor = min(height_scale_factor, width_scale_factor) - - image_height = int(image_height * optimal_scale_factor) - image_width = int(image_width * optimal_scale_factor) - - ncols = math.ceil(image_width / patch_width) - nrows = math.ceil(image_height / patch_height) - return (ncols + 1) * nrows -``` - -To calculate the maximum number of image tokens, recall that input images are first resized -to fit within `image_processor.size`. The maximum possible dimensions of the image before -being converted into patches is therefore equal to `image_processor.size`. +These image patches correspond to placeholder tokens (`|SPEAKER|`). 
So, we just need to maximize the number of image patches. Since input images are first resized +to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`. ```python def get_image_size_with_most_features(self) -> ImageSize: image_processor = self.get_image_processor() return ImageSize(width=image_processor.size["width"], height=image_processor.size["height"]) - -def get_max_image_tokens(self) -> int: - target_width, target_height = self.get_image_size_with_most_features() - - return self.get_num_image_tokens( - image_width=target_width, - image_height=target_height, - ) ``` -And thus, we can override the method as: +Fuyu does not expect image placeholders in the inputs to HF processor, so +the dummy prompt text is empty regardless of the number of images. ```python -def get_mm_max_tokens_per_item( +def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + return "" +``` + +For the multimodal image profiling data, the logic is very similar to LLaVA: + +```python +def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], -) -> Mapping[str, int]: - return {"image": self.get_max_image_tokens()} -``` - -:::{note} -Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) returns `ncols` and `nrows` directly instead of the total token count. -This is because `ncols` and `nrows` are used to specify the layout of the feature tokens (as shown in Step 4 of this guide). -::: - -:::: -::::: - -## 3. Specify dummy inputs - -Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for -HF processing as well as memory profiling. - -### For memory profiling - -Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs` -to construct dummy inputs for memory profiling. 
This dummy input should result in the worst-case memory usage of -the model so that vLLM can reserve the correct amount of memory for it. - -Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based -on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`. - -::::{tab-set} -:::{tab-item} Basic example: LLaVA -:sync: llava - -Making use of the `get_image_size_with_most_features` method implemented in Step 2: - -```python -def get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], -) -> ProcessorInputs: +) -> MultiModalDataDict: + target_width, target_height = \ + self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) - processor = self.info.get_hf_processor() - image_token = processor.image_token - - hf_config = self.get_hf_config() - target_width, target_height = self.info.get_image_size_with_most_features() - - mm_data = { + return { "image": self._get_dummy_images(width=target_width, height=target_height, num_images=num_images) } - - return ProcessorInputs( - prompt_text=image_token * num_images, - mm_data=mm_data, - ) -``` - -::: - -:::{tab-item} No input placeholders: Fuyu -:sync: fuyu - -Fuyu does not expect image placeholders in the inputs to HF processor, so -the dummy prompt text is empty regardless of the number of images. 
-Otherwise, the logic of this method is very similar to LLaVA: - -```python -def get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], -) -> ProcessorInputs: - target_width, target_height = \ - self.info.get_image_size_with_most_features() - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text="", - mm_data=mm_data, - ) ``` ::: @@ -860,8 +711,8 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch( ) ``` -To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails` -with different `full` and `feature` attributes: +To assign the vision embeddings to only the image tokens, instead of a string +you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`: ```python hf_config = self.info.get_hf_config() @@ -879,9 +730,9 @@ def get_replacement_fuyu(item_idx: int): image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows - return PromptUpdateDetails( - full=image_tokens + [bos_token_id], - features=image_tokens, + return PromptUpdateDetails.select_token_id( + image_tokens + [bos_token_id], + embed_token_id=_IMAGE_TOKEN_ID, ) ``` @@ -914,9 +765,9 @@ def _get_prompt_updates( image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows - return PromptUpdateDetails( - full=image_tokens + [bos_token_id], - features=image_tokens, + return PromptUpdateDetails.select_token_id( + image_tokens + [bos_token_id], + embed_token_id=_IMAGE_TOKEN_ID, ) return [ diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index a414118316692..31c7059fda364 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -11,6 +11,15 @@ We also believe in the power of community support; thus, answering queries, 
offe Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! +## Job Board + +Unsure on where to start? Check out the following links for tasks to work on: + +- [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) + - [Selected onboarding tasks](gh-project:6) +- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new%20model%22) + - [Models with multi-modal capabilities](gh-project:10) + ## License See . @@ -35,6 +44,12 @@ pre-commit run --all-files pytest tests/ ``` +:::{tip} +Since the `docker/Dockerfile` ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. + +Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. +::: + :::{note} Currently, the repository is not fully checked by `mypy`. ::: diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 1f60faf40879e..ca56710bc2ef2 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -19,6 +19,18 @@ $ docker run --runtime nvidia --gpus all \ --model mistralai/Mistral-7B-v0.1 ``` +This image can also be used with other container engines such as [Podman](https://podman.io/). + +```console +$ podman run --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 +``` + You can add any other engine arguments you need after the image tag (`vllm/vllm-openai:latest`). 
:::{note} @@ -34,11 +46,11 @@ If you need to use those dependencies (having accepted the license terms), create a custom Dockerfile on top of the base image with an extra layer that installs them: ```Dockerfile -FROM vllm/vllm-openai:v0.8.0 +FROM vllm/vllm-openai:v0.8.3 -# e.g. install the `audio` and `video` optional dependencies +# e.g. install the `audio` optional dependencies # NOTE: Make sure the version of vLLM matches the base image! -RUN uv pip install vllm[audio,video]==0.8.0 +RUN uv pip install --system vllm[audio]==0.8.3 ``` ::: @@ -52,7 +64,7 @@ with an extra layer that installs their code from source: ```Dockerfile FROM vllm/vllm-openai:latest -RUN uv pip install git+https://github.com/huggingface/transformers.git +RUN uv pip install --system git+https://github.com/huggingface/transformers.git ``` ::: @@ -61,11 +73,11 @@ RUN uv pip install git+https://github.com/huggingface/transformers.git ## Building vLLM's Docker Image from Source -You can build and run vLLM from source via the provided . To build vLLM: +You can build and run vLLM from source via the provided . To build vLLM: ```console # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 -DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai +DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile ``` :::{note} @@ -92,6 +104,7 @@ Keep an eye on memory usage with parallel jobs as it can be substantial (see exa # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) $ python3 use_existing_torch.py $ DOCKER_BUILDKIT=1 docker build . 
\ + --file docker/Dockerfile \ + --target vllm-openai \ + --platform "linux/arm64" \ + -t vllm/vllm-gh200-openai:latest \ diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md index cb758d3e6d2e4..8be38d56cd286 100644 --- a/docs/source/deployment/frameworks/index.md +++ b/docs/source/deployment/frameworks/index.md @@ -9,6 +9,7 @@ dstack helm lws modal +open-webui skypilot triton ::: diff --git a/docs/source/deployment/frameworks/open-webui.md b/docs/source/deployment/frameworks/open-webui.md new file mode 100644 index 0000000000000..83e5303a00ef2 --- /dev/null +++ b/docs/source/deployment/frameworks/open-webui.md @@ -0,0 +1,29 @@ +(deployment-open-webui)= + +# Open WebUI + +1. Install [Docker](https://docs.docker.com/engine/install/) + +2. Start the vLLM server with the supported chat completion model, e.g. + +```console +vllm serve qwen/Qwen1.5-0.5B-Chat +``` + +3. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port): + +```console +docker run -d -p 3000:8080 \ +--name open-webui \ +-v open-webui:/app/backend/data \ +-e OPENAI_API_BASE_URL=http://<vllm-serve-host>:<vllm-serve-port>/v1 \ +--restart always \ +ghcr.io/open-webui/open-webui:main +``` + +4. Open it in the browser: + +On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`. + +:::{image} /assets/deployment/open_webui.png +::: diff --git a/docs/source/deployment/integrations/production-stack.md b/docs/source/deployment/integrations/production-stack.md index e66e8e6a16b29..05f1568306cc9 100644 --- a/docs/source/deployment/integrations/production-stack.md +++ b/docs/source/deployment/integrations/production-stack.md @@ -16,7 +16,7 @@ Ensure that you have a running Kubernetes environment with GPU (you can follow [ ## Deployment using vLLM production stack -The standard vLLM production stack install uses a Helm chart. 
You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/tutorials/install-helm.sh) to install Helm on your GPU server. +The standard vLLM production stack is installed using a Helm chart. You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/utils/install-helm.sh) to install Helm on your GPU server. To install the vLLM production stack, run the following commands on your desktop: diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md index 3885956791365..9079cfa8e1b66 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/source/deployment/k8s.md @@ -46,6 +46,7 @@ metadata: type: Opaque data: token: $(HF_TOKEN) +EOF ``` Next, start the vLLM server as a Kubernetes Deployment and Service: diff --git a/docs/source/deployment/nginx.md b/docs/source/deployment/nginx.md index 62816f514c00e..bf404f1098c3b 100644 --- a/docs/source/deployment/nginx.md +++ b/docs/source/deployment/nginx.md @@ -69,14 +69,14 @@ server { ```console cd $vllm_root -docker build -f Dockerfile . --tag vllm +docker build -f docker/Dockerfile . --tag vllm ``` If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: ```console cd $vllm_root -docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy +docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy ``` (nginxloadbalancer-nginx-docker-network)= diff --git a/docs/source/design/mm_processing.md b/docs/source/design/mm_processing.md index 2a4dac786d4bc..dc92a3c2c511e 100644 --- a/docs/source/design/mm_processing.md +++ b/docs/source/design/mm_processing.md @@ -8,7 +8,7 @@ Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModal ## Prompt Update Detection -One of the main responsibilies of HF processor is to update the prompt with placeholder tokens. 
For example: +One of the main responsibilities of HF processor is to update the prompt with placeholder tokens. For example: - Insert feature placeholder tokens (e.g. `...`, the number of which equals to the feature size) at the start of the string. - Replace existing input placeholder tokens (e.g. `` for a single image) with feature placeholder tokens (e.g. `...`, the number of which equals to the feature size). @@ -47,7 +47,7 @@ Moreover, since the tokenized text has not passed through the HF processor, we h ### Dummy text -We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. +We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. (mm-automatic-prompt-updating)= diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index 55dae0bb92d4e..43fe5fe2e5e94 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -24,7 +24,7 @@ This document describes how vLLM deals with these challenges. [Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python - 3.14. + 3.14. In macOS, this is already the default. - `fork` - Use `os.fork()` to fork the Python interpreter. This is the default in Python versions prior to 3.14. 
@@ -34,7 +34,7 @@ This document describes how vLLM deals with these challenges. ### Tradeoffs `fork` is the fastest method, but is incompatible with dependencies that use -threads. +threads. If you are under macOS, using `fork` may cause the process to crash. `spawn` is more compatible with dependencies, but can be problematic when vLLM is used as a library. If the consuming code does not use a `__main__` guard (`if diff --git a/docs/source/design/v1/metrics.md b/docs/source/design/v1/metrics.md index b3981b2dc24a7..3f96290798a33 100644 --- a/docs/source/design/v1/metrics.md +++ b/docs/source/design/v1/metrics.md @@ -66,8 +66,8 @@ vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_ The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: - `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds -- `vllm:prompt_tokens_total` - Prompt Tokens/Sec -- `vllm:generation_tokens_total` - Generation Tokens/Sec +- `vllm:prompt_tokens_total` - Prompt Tokens +- `vllm:generation_tokens_total` - Generation Tokens - `vllm:time_per_output_token_seconds` - Inter token latency (Time Per Output Token, TPOT) in second. - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds. - `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in RUNNING, WAITING, and SWAPPED state @@ -86,6 +86,17 @@ See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful b Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs. 
+With the switch to `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657): + +```bash +$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*' +http_requests_total{handler="/v1/completions",method="POST",status="2xx"} 201.0 +http_request_size_bytes_count{handler="/v1/completions"} 201.0 +http_response_size_bytes_count{handler="/v1/completions"} 201.0 +http_request_duration_highr_seconds_count 201.0 +http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201.0 +``` + ### Multi-process Mode In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See . diff --git a/docs/source/design/v1/prefix_caching.md b/docs/source/design/v1/prefix_caching.md index 3d14a76840d45..ec1f3cb8d64a8 100644 --- a/docs/source/design/v1/prefix_caching.md +++ b/docs/source/design/v1/prefix_caching.md @@ -15,12 +15,13 @@ Block 3: |<------------------ prefix -------------------->| |<--- block tokens - In the example above, the KV cache in the first block can be uniquely identified with the token “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the block hash of `hash(tuple[components])`, where components are: * Parent hash value: The hash value of the parent hash block. -* Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision. +* Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision. * Extra hashes: Other values required to make this block unique, such as LoRA IDs and multi-modality input hashes (see the example below). 
-Note 1: We only cache full blocks. +> **Note 1:** We only cache full blocks. -Note 2: The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value, but this should be nearly impossible to happen. Of course, contributions are welcome if you have an awesome idea to eliminate collusion entirely. +> **Note 2:** The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we advise to use SHA256** as hash function instead of the default builtin hash. +SHA256 is supported since vLLM v0.8.3 and must be enabled with a command line argument. It comes with a performance impact of about 100-200ns per token (~6ms for 50k tokens of context). **A hashing example with multi-modality inputs** In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assuming we have a request with the following messages: diff --git a/docs/source/design/v1/torch_compile.md b/docs/source/design/v1/torch_compile.md index 0dadc8089991c..7920131643c26 100644 --- a/docs/source/design/v1/torch_compile.md +++ b/docs/source/design/v1/torch_compile.md @@ -99,7 +99,7 @@ This time, Inductor compilation is completely bypassed, and we will load from di The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example: -`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"` +`vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"` Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. 
This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel. @@ -126,7 +126,7 @@ Unfortunately, because auto-tuning takes quite a long time (from seconds to minu ## Cudagraph Capture -vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trival to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation. +vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trivial to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation. The piecewise cudagraph also has fine-grained memory management. The purpose is to only exclude the attention kernel from cudagraph, while keeping all the rest modules and the memory allocation operations in the cudagraph. This is why the attention operation in V1 has the output tensor as the input of the attention. 
@@ -134,6 +134,6 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`: -`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` +`vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index a71da72e4360a..b5b51095b3a75 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -106,19 +106,18 @@ curl http://localhost:8000/v1/completions \ ## Dynamically serving LoRA Adapters -In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading -LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility -to change models on-the-fly is needed. +In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed. Note: Enabling this feature in production environments is risky as users may participate in model adapter management. -To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` -is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. +To enable dynamic LoRA configuration, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` +is set to `True`. 
```bash export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True ``` +### Using API Endpoints Loading a LoRA Adapter: To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary @@ -153,6 +152,58 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \ }' ``` +### Using Plugins +Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter. + +You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds. + +You can either install existing plugins or implement your own. + +Steps to implement your own LoRAResolver plugin: +1. Implement the LoRAResolver interface. 
+ + Example of a simple S3 LoRAResolver implementation: + + ```python + import os + import s3fs + from vllm.lora.request import LoRARequest + from vllm.lora.resolver import LoRAResolver + + class S3LoRAResolver(LoRAResolver): + def __init__(self): + self.s3 = s3fs.S3FileSystem() + self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") + self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") + + async def resolve_lora(self, base_model_name, lora_name): + s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) + local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) + + # Download the LoRA from S3 to the local path + await self.s3._get( + s3_path, local_path, recursive=True, maxdepth=1 + ) + + lora_request = LoRARequest( + lora_name=lora_name, + lora_path=local_path, + lora_int_id=abs(hash(lora_name)) + ) + return lora_request + ``` + +2. Register LoRAResolver plugin. + + ```python + from vllm.lora.resolver import LoRAResolverRegistry + + s3_resolver = S3LoRAResolver() + LoRAResolverRegistry.register_resolver("s3_resolver", s3_resolver) + ``` + + For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md). + ## New format for `--lora-modules` In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example: diff --git a/docs/source/features/quantization/bitblas.md b/docs/source/features/quantization/bitblas.md new file mode 100644 index 0000000000000..2901f760d3e4c --- /dev/null +++ b/docs/source/features/quantization/bitblas.md @@ -0,0 +1,48 @@ +(bitblas)= + +# BitBLAS + +vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. Compared to other quantization frameworks, BitBLAS provides more precision combinations. + +:::{note} +Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). 
+Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. +For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html). +::: + +Below are the steps to utilize BitBLAS with vLLM. + +```console +pip install bitblas>=0.1.0 +``` + +vLLM reads the model's config file and supports pre-quantized checkpoints. + +You can find pre-quantized models on: + +- [Hugging Face (BitBLAS)](https://huggingface.co/models?other=bitblas) +- [Hugging Face (GPTQ)](https://huggingface.co/models?other=gptq) + +Usually, these repositories have a `quantize_config.json` file that includes a `quantization_config` section. + +## Read bitblas format checkpoint + +```python +from vllm import LLM +import torch + +# "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint. +model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas" +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas") +``` + +## Read gptq format checkpoint + +```python +from vllm import LLM +import torch + +# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. +model_id = "hxbgsyxh/llama-13b-4bit-g-1" +llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024) +``` diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md index fc499e7692d98..e356b99d85cdf 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/source/features/quantization/bnb.md @@ -19,17 +19,20 @@ And usually, these repositories have a config.json file that includes a quantiza ## Read quantized checkpoint +For pre-quantized checkpoints, vLLM will try to infer the quantization method from the config file, so you don't need to explicitly specify the quantization argument. + ```python from vllm import LLM import torch # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. 
model_id = "unsloth/tinyllama-bnb-4bit" -llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ -quantization="bitsandbytes") +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True) ``` ## Inflight quantization: load as 4bit quantization +For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify the quantization argument. + ```python from vllm import LLM import torch @@ -40,7 +43,7 @@ quantization="bitsandbytes") ## OpenAI Compatible Server -Append the following to your 4bit model arguments: +Append the following to your model arguments for 4bit inflight quantization: ```console --quantization bitsandbytes diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md index 4b1ff4a22a23b..e93e4dcd3b578 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -29,7 +29,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlam We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. ::: -GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-confing-path +GGUF assumes that huggingface can convert the metadata to a config file. 
In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path ```console # If your model is not supported by huggingface you can manually provide a huggingface compatible config path diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index 65f438f599f19..c7c8aeb662a56 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -11,10 +11,13 @@ Quantization trades off model precision for smaller memory footprint, allowing l supported_hardware auto_awq bnb +bitblas gguf gptqmodel int4 int8 fp8 +quark quantized_kvcache +torchao ::: diff --git a/docs/source/features/quantization/quark.md b/docs/source/features/quantization/quark.md new file mode 100644 index 0000000000000..935ee37a815ff --- /dev/null +++ b/docs/source/features/quantization/quark.md @@ -0,0 +1,217 @@ +(quark)= + +# AMD QUARK + +Quantization can effectively reduce memory and bandwidth usage, accelerate computation and improve +throughput with minimal accuracy loss. vLLM can leverage [Quark](https://quark.docs.amd.com/latest/), +the flexible and powerful quantization toolkit, to produce performant quantized models to run on AMD GPUs. Quark has specialized support for quantizing large language models with weight, +activation and kv-cache quantization and cutting-edge quantization algorithms like +AWQ, GPTQ, Rotation and SmoothQuant. + +## Quark Installation + +Before quantizing models, you need to install Quark. The latest release of Quark can be installed with pip: + +```console +pip install amd-quark +``` + +You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html) +for more installation details. + +## Quantization Process + +After installing Quark, we will use an example to illustrate how to use Quark. +The Quark quantization process can be listed in the 5 steps below: + +1. Load the model +2. 
Prepare the calibration dataloader +3. Set the quantization configuration +4. Quantize the model and export +5. Evaluation in vLLM + +### 1. Load the Model + +Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index) +to fetch model and tokenizer. + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +MODEL_ID = "meta-llama/Llama-2-70b-chat-hf" +MAX_SEQ_LEN = 512 + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto", +) +model.eval() + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN) +tokenizer.pad_token = tokenizer.eos_token +``` + +### 2. Prepare the Calibration Dataloader + +Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html) +to load calibration data. For more details about how to use calibration datasets efficiently, please refer +to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html). + +```python +from datasets import load_dataset +from torch.utils.data import DataLoader + +BATCH_SIZE = 1 +NUM_CALIBRATION_DATA = 512 + +# Load the dataset and get calibration data. +dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") +text_data = dataset["text"][:NUM_CALIBRATION_DATA] + +tokenized_outputs = tokenizer(text_data, return_tensors="pt", + padding=True, truncation=True, max_length=MAX_SEQ_LEN) +calib_dataloader = DataLoader(tokenized_outputs['input_ids'], + batch_size=BATCH_SIZE, drop_last=True) +``` + +### 3. Set the Quantization Configuration + +We need to set the quantization configuration, you can check +[quark config guide](https://quark.docs.amd.com/latest/pytorch/user_guide_config_description.html) +for further details. Here we use FP8 per-tensor quantization on weight, activation, +kv-cache and the quantization algorithm is AutoSmoothQuant. 
+ +:::{note} +Note the quantization algorithm needs a JSON config file and the config file is located in +[Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html), +under the directory `examples/torch/language_modeling/llm_ptq/models`. For example, +AutoSmoothQuant config file for Llama is +`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. +::: + +```python +from quark.torch.quantization import (Config, QuantizationConfig, + FP8E4M3PerTensorSpec, + load_quant_algo_config_from_file) + +# Define fp8/per-tensor/static spec. +FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", + is_dynamic=False).to_quantization_spec() + +# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. +global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, + weight=FP8_PER_TENSOR_SPEC) + +# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. +KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC +kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] +kv_cache_quant_config = {name : + QuantizationConfig(input_tensors=global_quant_config.input_tensors, + weight=global_quant_config.weight, + output_tensors=KV_CACHE_SPEC) + for name in kv_cache_layer_names_for_llama} +layer_quant_config = kv_cache_quant_config.copy() + +# Define algorithm config by config file. +LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = + 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' +algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) + +EXCLUDE_LAYERS = ["lm_head"] +quant_config = Config( + global_quant_config=global_quant_config, + layer_quant_config=layer_quant_config, + kv_cache_quant_config=kv_cache_quant_config, + exclude=EXCLUDE_LAYERS, + algo_config=algo_config) +``` + +### 4. Quantize the Model and Export + +Then we can apply the quantization. 
After quantizing, we need to freeze the +quantized model first before exporting. Note that we need to export model with format of +HuggingFace `safetensors`, you can refer to +[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html) +for more exporting format details. + +```python +import torch +from quark.torch import ModelQuantizer, ModelExporter +from quark.torch.export import ExporterConfig, JsonExporterConfig + +# Apply quantization. +quantizer = ModelQuantizer(quant_config) +quant_model = quantizer.quantize_model(model, calib_dataloader) + +# Freeze quantized model to export. +freezed_model = quantizer.freeze(model) + +# Define export config. +LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"] +export_config = ExporterConfig(json_export_config=JsonExporterConfig()) +export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP + +EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" +exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) +with torch.no_grad(): + exporter.export_safetensors_model(freezed_model, + quant_config=quant_config, tokenizer=tokenizer) +``` + +### 5. Evaluation in vLLM + +Now, you can load and run the Quark quantized model directly through the LLM entrypoint: + +```python +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", + kv_cache_dtype='fp8',quantization='quark') +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. 
+print("\nGenerated Outputs:\n" + "-" * 60) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) +``` + +Or, you can use `lm_eval` to evaluate accuracy: + +```console +$ lm_eval --model vllm \ + --model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \ + --tasks gsm8k +``` + +## Quark Quantization Script +In addition to the example of Python API above, Quark also offers a +[quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html) +to quantize large language models more conveniently. It supports quantizing models with variety +of different quantization schemes and optimization algorithms. It can export the quantized model +and run evaluation tasks on the fly. With the script, the example above can be: + +```console +python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \ + --output_dir /path/to/output \ + --quant_scheme w_fp8_a_fp8 \ + --kv_cache_dtype fp8 \ + --quant_algo autosmoothquant \ + --num_calib_data 512 \ + --model_export hf_format \ + --tasks gsm8k +``` diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md index a5bd8caf77cd0..984e6626e2417 100644 --- a/docs/source/features/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -62,7 +62,7 @@ The table below shows the compatibility of various quantization implementations * ❌ * ✅︎ * ❌ - * ❌ + * ✅︎ - * FP8 (W8A8) * ❌ * ❌ @@ -74,6 +74,17 @@ The table below shows the compatibility of various quantization implementations * ❌ * ❌ * ❌ +- * BitBLAS (GPTQ) + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ❌ + * ❌ + * ❌ + * ❌ - * AQLM * ✅︎ * ✅︎ diff --git a/docs/source/features/quantization/torchao.md b/docs/source/features/quantization/torchao.md 
new file mode 100644 index 0000000000000..9a85f0bab9ec7 --- /dev/null +++ b/docs/source/features/quantization/torchao.md @@ -0,0 +1,34 @@ +# TorchAO + +TorchAO is an architecture optimization library for PyTorch; it provides high performance dtypes, optimization techniques and kernels for inference and training, featuring composability with native PyTorch features like torch.compile, FSDP etc. Some benchmark numbers can be found [here](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks). + +We recommend installing the latest torchao nightly with + +```console +# Install the latest TorchAO nightly build +# Choose the CUDA version that matches your system (cu126, cu128, etc.) +pip install --pre "torchao>=10.0.0" --index-url https://download.pytorch.org/whl/nightly/cu126 +``` + +## Quantizing HuggingFace Models +You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code: + +```Python +import torch +from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer +from torchao.quantization import Int8WeightOnlyConfig + +model_name = "meta-llama/Meta-Llama-3-8B" +quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) +quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config) +tokenizer = AutoTokenizer.from_pretrained(model_name) +input_text = "What are we having for dinner?" 
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + +hub_repo = "YOUR_HUB_REPO_ID"  # e.g. "your-username/llama3-8b-int8wo" +tokenizer.push_to_hub(hub_repo) +quantized_model.push_to_hub(hub_repo, safe_serialization=False) +``` + +Alternatively, you can use the TorchAO Quantization space for quantizing models with a simple UI. +See: https://huggingface.co/spaces/medmekk/TorchAO_Quantization diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md index 0b170aadc3443..3a0be69f8e1c6 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/source/features/reasoning_outputs.md @@ -4,7 +4,7 @@ vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. -Reasoning models return a additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models. +Reasoning models return an additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models. ## Supported Models @@ -14,6 +14,9 @@ vLLM currently supports the following reasoning models: |--------------|-------------|------------------|-------------| | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ | +| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | + +- IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. 
## Quickstart @@ -43,6 +46,7 @@ model = models.data[0].id # Round 1 messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` response = client.chat.completions.create(model=model, messages=messages) reasoning_content = response.choices[0].message.reasoning_content @@ -97,6 +101,7 @@ models = client.models.list() model = models.data[0].id messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` stream = client.chat.completions.create(model=model, messages=messages, stream=True) @@ -131,7 +136,14 @@ Remember to check whether the `reasoning_content` exists in the response before ## Structured output -The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. +The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now. + +```bash +VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --enable-reasoning --reasoning-parser deepseek_r1 +``` + +Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine. ```python from openai import OpenAI diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index 3e1f1d5be7523..f16e0d96522da 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -52,7 +52,7 @@ python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model ``` :::{warning} -Note: Please use `--speculative_config` to set all configurations related to speculative decoding. 
The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release. +Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now. ::: Then use a client: diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md index 2e1081bf8d14b..57888e122969d 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -1,6 +1,6 @@ # Tool Calling -vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but [on the roadmap](gh-issue:13002). +vLLM currently supports named function calling, as well as the `auto`, `required` (as of `vllm>=0.8.3`) and `none` options for the `tool_choice` field in the chat completion API. ## Quickstart @@ -91,6 +91,12 @@ For best results, we recommend ensuring that the expected output format / schema To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. +## Required Function Calling + +vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. 
However, support for alternative decoding backends is on the [roadmap](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#feature-model) for the V1 engine. + +When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter. + ## Automatic Function Calling To enable this feature, you should set the following flags: @@ -146,12 +152,13 @@ Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_cha Supported models: -* `meta-llama/Meta-Llama-3.1-8B-Instruct` -* `meta-llama/Meta-Llama-3.1-70B-Instruct` -* `meta-llama/Meta-Llama-3.1-405B-Instruct` -* `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` +All Llama 3.1 and 3.2 models should be supported. + +* `meta-llama/Llama-3.1-*` +* `meta-llama/Llama-3.2-*` + +The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. -The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) in Llama-3.2 models, see the `pythonic` tool parser below. Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: @@ -160,10 +167,14 @@ Known issues: 2. 
The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. -The `tool_chat_template_llama3_json.jinja` file contains the "official" Llama chat template, but tweaked so that -it works better with vLLM. +VLLM provides two JSON based chat templates for Llama 3.1 and 3.2: -Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` +* `examples/tool_chat_template_llama3.1_json.jinja` - this is the "official" chat template for the Llama 3.1 +models, but tweaked so that it works better with vLLM. +* `examples/tool_chat_template_llama3.2_json.jinja` - this extends upon the Llama 3.1 chat template by adding support for +images. + +Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}` #### IBM Granite @@ -239,6 +250,8 @@ Example supported models: * `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) * `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) * `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) +* `meta-llama/Llama-4-Scout-17B-16E-Instruct`\* (use with `examples/tool_chat_template_llama4_pythonic.jinja`) +* `meta-llama/Llama-4-Maverick-17B-128E-Instruct`\* (use with `examples/tool_chat_template_llama4_pythonic.jinja`) Flags: `--tool-call-parser pythonic --chat-template {see_above}` diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 1206d5fe75390..f77dbefb0a018 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -17,6 +17,7 @@ def fix_case(text: str) -> str: "cli": "CLI", "cpu": "CPU", "llm": "LLM", + "mae": "MAE", "tpu": "TPU", "aqlm": "AQLM", "gguf": "GGUF", @@ -24,6 +25,7 @@ def fix_case(text: str) -> str: "rlhf": "RLHF", "vllm": "vLLM", "openai": "OpenAI", + "lmcache": "LMCache", "multilora": "MultiLoRA", "mlpspeculator": 
"MLPSpeculator", r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index e91ed6fbd7a88..78938de317c48 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -13,11 +13,11 @@ There are no pre-built wheels or images for this device, so you must build vLLM - Intel Gaudi accelerator - Intel Gaudi software version 1.18.0 -Please follow the instructions provided in the [Gaudi Installation -Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) +Please follow the instructions provided in the +[Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the execution environment. To achieve the best performance, -please follow the methods outlined in the [Optimizing Training Platform -Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). +please follow the methods outlined in the +[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). ## Configure a new environment @@ -32,15 +32,13 @@ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloade pip list | grep neural # verify that neural_compressor is installed ``` -Refer to [Intel Gaudi Software Stack -Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) +Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. ### Run Docker Image It is highly recommended to use the latest Docker image from Intel Gaudi -vault. 
Refer to the [Intel Gaudi -documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) +vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. Use the following commands to run a Docker image: @@ -86,7 +84,7 @@ Currently, there are no pre-built Intel Gaudi images. ### Build image from source ```console -docker build -f Dockerfile.hpu -t vllm-hpu-env . +docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env ``` @@ -278,8 +276,9 @@ Lower value corresponds to less usable graph memory reserved for prefill stage, ::: User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: -\- `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode -\- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt + +- `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. 
`(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode +- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. @@ -326,8 +325,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - We recommend running inference on Gaudi 2 with `block_size` of 128 for BF16 data type. Using default values (16, 32) might lead to sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see [Gaudi - Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). + under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). - For max throughput on Llama 7B, we recommend running with batch size of 128 or 256 and max context length of 2048 with HPU Graphs enabled. If you encounter out-of-memory issues, see troubleshooting section. @@ -336,11 +334,11 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi **Diagnostic and profiling knobs:** -- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. 
Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. +- `VLLM_PROFILER_ENABLED`: If `true`, enable the high level profiler. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). `false` by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: If `true`, log graph compilations for each vLLM engine step when any occurs. Highly recommended to use with `PT_HPU_METRICS_GC_DETAILS=1`. `false` by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: If `true`, always log graph compilations for each vLLM engine step even if none occurred. `false` by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: If `true`, log CPU fallbacks for each vLLM engine step when any occurs. `false` by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, always log CPU fallbacks for each vLLM engine step even if none occurred. `false` by default. 
**Performance tuning knobs:** @@ -381,7 +379,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: -- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default +- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used; if `1`, PyTorch Lazy backend for Gaudi will be used. `1` is default. - `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs ## Troubleshooting: tweaking HPU graphs diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md index 4c668a8e68927..b4bfb696faa28 100644 --- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md @@ -132,7 +132,7 @@ Currently, there are no pre-built Neuron images. See for instructions on building the Docker image. -Make sure to use in place of the default Dockerfile. +Make sure to use in place of the default Dockerfile. ## Extra information diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md index 6c7bbf6024992..beb803cf05978 100644 --- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md @@ -169,10 +169,10 @@ See for instructions on using the o ### Build image from source -You can use to build a Docker image with TPU support. +You can use to build a Docker image with TPU support. ```console -docker build -f Dockerfile.tpu -t vllm-tpu . +docker build -f docker/Dockerfile.tpu -t vllm-tpu . 
``` Run the Docker image with the following command: diff --git a/docs/source/getting_started/installation/cpu.md b/docs/source/getting_started/installation/cpu.md index 1b2ffd6199945..2c0ec60d7100f 100644 --- a/docs/source/getting_started/installation/cpu.md +++ b/docs/source/getting_started/installation/cpu.md @@ -159,26 +159,45 @@ Currently, there are no pre-built CPU wheels. ### Pre-built images -Currently, there are no pre-build CPU images. +:::::{tab-set} +:sync-group: device + +::::{tab-item} Intel/AMD x86 +:sync: x86 + +:::{include} cpu/x86.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +::: + +:::: + +::::: ### Build image from source ```console -$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . -$ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env +$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai . + +# Launching OpenAI server +$ docker run --rm \ + --privileged=true \ + --shm-size=4g \ + -p 8000:8000 \ + -e VLLM_CPU_KVCACHE_SPACE= \ + -e VLLM_CPU_OMP_THREADS_BIND= \ + vllm-cpu-env \ + --model=meta-llama/Llama-3.2-1B-Instruct \ + --dtype=bfloat16 \ + other vLLM OpenAI server arguments ``` ::::{tip} -For ARM or Apple silicon, use `Dockerfile.arm` +For ARM or Apple silicon, use `docker/Dockerfile.arm` :::: ::::{tip} -For IBM Z (s390x), use `Dockerfile.s390x` and in `docker run` use flag `--dtype float` +For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float` :::: ## Supported features @@ -253,12 +272,14 @@ $ python examples/offline_inference/basic/basic.py - Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. 
However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. -- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. +- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, Tensor Parallel is an option for better performance. - - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: + - Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: ```console VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. 
Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.inc.md). + - For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should guarantee threads in the list belong to the same NUMA node. + + - Meanwhile, users should also take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`; if it exceeds the capacity of a single NUMA node, the TP worker will be killed due to out-of-memory. diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/source/getting_started/installation/cpu/build.inc.md index 39d9dfbd2b2e2..f385f3d5b1984 100644 --- a/docs/source/getting_started/installation/cpu/build.inc.md +++ b/docs/source/getting_started/installation/cpu/build.inc.md @@ -2,7 +2,7 @@ First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as ```console sudo apt-get update -y -sudo apt-get install -y gcc-12 g++-12 libnuma-dev +sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` @@ -26,3 +26,9 @@ Finally, build and install vLLM CPU backend: ```console VLLM_TARGET_DEVICE=cpu python setup.py install ``` + +If you want to develop vllm, install it in editable mode instead. 
+ +```console +VLLM_TARGET_DEVICE=cpu python setup.py develop +``` diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md index b2f3bafb4e511..9ae2035db5433 100644 --- a/docs/source/getting_started/installation/cpu/x86.inc.md +++ b/docs/source/getting_started/installation/cpu/x86.inc.md @@ -34,6 +34,8 @@ There are no pre-built wheels or images for this device, so you must build vLLM ### Pre-built images +See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo) + ### Build image from source ## Extra information diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index d3e375aec10cb..cd39d6376fe3c 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -46,7 +46,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe ##### Install the latest code using `pip` ```console -pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly ``` `--pre` is required for `pip` to consider pre-released versions. @@ -65,9 +65,11 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p Another way to install the latest code is to use `uv`: ```console -uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly +uv pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly ``` +##### Install specific revisions using `uv` + If you want to access the wheels for previous commits (e.g. 
to bisect the behavior change, performance regression), you can specify the commit hash in the URL: ```console diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index 4381cef5e96a3..21c8d7d01adeb 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -8,7 +8,7 @@ There are no pre-built wheels for this device, so you must either use the pre-bu ## Requirements -- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) +- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201) - ROCm 6.3 ## Set up using Python @@ -31,7 +31,7 @@ Currently, there are no pre-built ROCm wheels. ```console # Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.3 + $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 ``` 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) @@ -123,7 +123,7 @@ Building the Docker image from source is the recommended way to use vLLM with RO #### (Optional) Build an image with ROCm software stack -Build a docker image from which setup ROCm software stack needed by the vLLM. +Build a docker image from which setup ROCm software stack needed by the vLLM. **This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.** If you choose to build this rocm_base image yourself, the steps are as follows. @@ -140,12 +140,12 @@ It is important that the user kicks off the docker build using buildkit. 
Either To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: ```console -DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm_base -t rocm/vllm-dev:base . +DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm_base -t rocm/vllm-dev:base . ``` #### Build an image with vLLM -First, build a docker image from and launch a docker container from the image. +First, build a docker image from and launch a docker container from the image. It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: ```console @@ -156,10 +156,10 @@ It is important that the user kicks off the docker build using buildkit. Either } ``` - uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches. + uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches. It provides flexibility to customize the build of docker image using the following arguments: -- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using +- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. 
It is being built using - `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build - `BUILD_RPD`: Include RocmProfileData profiling tool in the image - `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image @@ -169,13 +169,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: ```console -DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm . ``` To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: ```console -DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f docker/Dockerfile.rocm -t vllm-rocm . ``` To run the above docker image `vllm-rocm`, use the below command: diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index 84a9b387789c7..fbf5421eeec5b 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -23,6 +23,8 @@ Currently, there are no pre-built XPU wheels. - Second, install Python packages for vLLM XPU backend building: ```console +git clone https://github.com/vllm-project/vllm.git +cd vllm pip install --upgrade pip pip install -v -r requirements/xpu.txt ``` @@ -54,7 +56,7 @@ Currently, there are no pre-built XPU images. ### Build image from source ```console -$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . +$ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . 
$ docker run -it \ --rm \ --network=host \ diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md index 6ea44c36db324..a03d35030fe8a 100644 --- a/docs/source/getting_started/installation/python_env_setup.inc.md +++ b/docs/source/getting_started/installation/python_env_setup.inc.md @@ -1,4 +1,4 @@ -You can create a new Python environment using `conda`: +You can create a new Python environment using [conda](https://docs.conda.io/projects/conda/en/stable/user-guide/getting-started.html): ```console # (Recommended) Create a new conda environment. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index b5246c41883ea..25189b006c260 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -208,5 +208,5 @@ Currently, vLLM supports multiple backends for efficient Attention computation a If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`. ```{attention} -There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for instructions on how to install it. +There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see for instructions on how to install it. 
``` diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index fdfaf9f932698..a4744827f2268 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -24,7 +24,15 @@ To isolate the model downloading and loading issue, you can use the `--load-form ## Out of memory -If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider [using tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider adopting [these options](#reducing-memory-usage) to reduce the memory consumption. + +## Generation quality changed + +In v0.8.0, the source of default sampling parameters was changed in . Prior to v0.8.0, the default sampling parameters came from vLLM's set of neutral defaults. From v0.8.0 onwards, the default sampling parameters come from the `generation_config.json` provided by the model creator. + +In most cases, this should lead to higher quality responses, because the model creator is likely to know which sampling parameters are best for their model. However, in some cases the defaults provided by the model creator can lead to degraded performance. + +You can check if this is happening by trying the old defaults with `--generation-config vllm` for online and `generation_config="vllm"` for offline. 
If, after trying this, your generation quality improves, we would recommend continuing to use the vLLM defaults and petition the model creator to update their default `generation_config.json` so that it produces better quality generations. ## Enable more logging diff --git a/docs/source/getting_started/v1_user_guide.md b/docs/source/getting_started/v1_user_guide.md index b1c2807657ffa..a87484c3bb042 100644 --- a/docs/source/getting_started/v1_user_guide.md +++ b/docs/source/getting_started/v1_user_guide.md @@ -47,9 +47,9 @@ This living user guide outlines a few known **important changes and limitations* | **Logprobs Calculation** | 🟢 Functional | | **LoRA** | 🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))| | **Multimodal Models** | 🟢 Functional | +| **FP8 KV Cache** | 🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))| | **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))| | **Prompt Logprobs with Prefix Caching** | 🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))| -| **FP8 KV Cache** | 🟡 Planned | | **Structured Output Alternative Backends** | 🟡 Planned | | **Embedding Models** | 🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249)) | | **Mamba Models** | 🟡 Planned | @@ -134,8 +134,6 @@ in progress. #### Features to Be Supported -- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key–value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache. - - **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. V1 currently supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar. 
Details about the structured outputs can be found @@ -158,10 +156,3 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`). For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html). - -## Frequently Asked Questions - -**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?** -The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`. - -On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks. diff --git a/docs/source/index.md b/docs/source/index.md index 1624d5cf5aae7..28dc0f67d7746 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -43,7 +43,7 @@ vLLM is flexible and easy to use with: - Tensor parallelism and pipeline parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. 
- Prefix caching support - Multi-lora support @@ -77,9 +77,9 @@ getting_started/v1_user_guide :caption: Models :maxdepth: 1 +models/supported_models models/generative_models models/pooling_models -models/supported_models models/extensions/index ::: diff --git a/docs/source/models/extensions/fastsafetensor.md b/docs/source/models/extensions/fastsafetensor.md index 66cd710c97e9f..531d58690014e 100644 --- a/docs/source/models/extensions/fastsafetensor.md +++ b/docs/source/models/extensions/fastsafetensor.md @@ -1,5 +1,5 @@ Loading Model weights with fastsafetensors =================================================================== -Using fastsafetensor library enables loading model weights to GPU memory by leveraging GPU direct storage. See https://github.com/foundation-model-stack/fastsafetensors for more details. +Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details. For enabling this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true`` diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md index 830c579d91bae..cd94c81e620a2 100644 --- a/docs/source/models/extensions/tensorizer.md +++ b/docs/source/models/extensions/tensorizer.md @@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). +the [vLLM example script](https://docs.vllm.ai/en/latest/getting_started/examples/tensorize_vllm_model.html). 
:::{note} Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index c94e940b8534c..63fc53b0e7c55 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -23,6 +23,8 @@ It is similar to [its counterpart in HF Transformers](https://huggingface.co/doc except that tokenization and detokenization are also performed automatically. ```python +from vllm import LLM + llm = LLM(model="facebook/opt-125m") outputs = llm.generate("Hello, my name is") @@ -36,6 +38,8 @@ You can optionally control the language generation by passing {class}`~vllm.Samp For example, you can use greedy sampling by setting `temperature=0`: ```python +from vllm import LLM, SamplingParams + llm = LLM(model="facebook/opt-125m") params = SamplingParams(temperature=0) outputs = llm.generate("Hello, my name is", params) @@ -83,6 +87,8 @@ Base models may perform poorly as they are not trained to respond to the chat co ::: ```python +from vllm import LLM + llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") conversation = [ { diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index f774f3d0fa0ed..5f1c2b5b4a3ba 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -68,6 +68,8 @@ The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM. It returns the extracted hidden states directly, which is useful for reward models. ```python +from vllm import LLM + llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") (output,) = llm.encode("Hello, my name is") @@ -81,6 +83,8 @@ The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt. It is primarily designed for embedding models. 
```python +from vllm import LLM + llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") (output,) = llm.embed("Hello, my name is") @@ -96,6 +100,8 @@ The {class}`~vllm.LLM.classify` method outputs a probability vector for each pro It is primarily designed for classification models. ```python +from vllm import LLM + llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") (output,) = llm.classify("Hello, my name is") @@ -116,6 +122,8 @@ To handle RAG at a higher level, you should use integration frameworks such as [ ::: ```python +from vllm import LLM + llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") (output,) = llm.score("What is the capital of France?", "The capital of Brazil is Brasilia.") @@ -133,3 +141,77 @@ Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints tha - [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. - [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models. - [Score API](#score-api) is similar to `LLM.score` for cross-encoder models. + +## Matryoshka Embeddings + +[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost. + +:::{warning} +Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. + +For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error. 
+ +```json +{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400} +``` + +::: + +### Manually enable Matryoshka Embeddings + +There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, we simply check the existence of the fields `is_matryoshka` or `matryoshka_dimensions` inside `config.json`. + +For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` (offline) or `--hf_overrides '{"is_matryoshka": true}'` (online). + +Here is an example to serve a model with Matryoshka Embeddings enabled. + +```text +vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"is_matryoshka":true}' +``` + +### Offline Inference + +You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in {class}`~vllm.PoolingParams`. + +```python +from vllm import LLM, PoolingParams + +model = LLM(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) +outputs = model.embed(["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32)) +print(outputs[0].outputs) +``` + +A code example can be found here: + +### Online Inference + +Use the following command to start vllm server. + +```text +vllm serve jinaai/jina-embeddings-v3 --trust-remote-code +``` + +You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter. 
+ +```text +curl http://127.0.0.1:8000/v1/embeddings \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "input": "Follow the white rabbit.", + "model": "jinaai/jina-embeddings-v3", + "encoding_format": "float", + "dimensions": 1 + }' +``` + +Expected output: + +```json +{"id":"embd-0aab28c384d348c3b8f0eb783109dc5f","object":"list","created":1744195454,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-1.0]}],"usage":{"prompt_tokens":10,"total_tokens":10,"completion_tokens":0,"prompt_tokens_details":null}} +``` + +An OpenAI client example can be found here: diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 56ea8c5d8372b..0fdffbeefd031 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -1,66 +1,39 @@ (supported-models)= -# List of Supported Models +# Supported Models -vLLM supports generative and pooling models across various tasks. +vLLM supports [generative](generative-models) and [pooling](pooling-models) models across various tasks. If a model supports more than one task, you can set the task via the `--task` argument. For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. -## Loading a Model +## Model Implementation -### HuggingFace Hub +### vLLM -By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models). +If vLLM natively supports a model, its implementation can be found in . -To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository. -If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. +These models are what we list in and . -Models do not _need_ to be natively supported to be used in vLLM. 
-The enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). +(transformers-backend)= -:::{tip} -The easiest way to check if your model is really supported at runtime is to run the program below: +### Transformers + +vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned! + +To check if the modeling backend is Transformers, you can simply do this: ```python from vllm import LLM - -# For generative models (task=generate) only -llm = LLM(model=..., task="generate") # Name or path of your model -output = llm.generate("Hello, my name is") -print(output) - -# For pooling models (task={embed,classify,reward,score}) only -llm = LLM(model=..., task="embed") # Name or path of your model -output = llm.encode("Hello, my name is") -print(output) -``` - -If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. -::: - -Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. -Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. - -(transformers-fallback)= - -### Transformers fallback - -vLLM can fallback to model implementations that are available in Transformers. This does not work for all models for now, but most decoder language models are supported, and vision language model support is planned! - -To check if the backend is Transformers, you can simply do this: - -```python -from vllm import LLM llm = LLM(model=..., task="generate") # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` -If it is `TransformersModel` then it means it's based on Transformers! 
+If it is `TransformersForCausalLM` then it means it's based on Transformers! :::{tip} -You can force the use of `TransformersModel` by setting `model_impl="transformers"` for or `--model-impl transformers` for the . +You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for or `--model-impl transformers` for the . ::: :::{note} @@ -69,27 +42,30 @@ vLLM may not fully optimise the Transformers implementation so you may see degra #### Supported features -The Transformers fallback explicitly supports the following features: +The Transformers modeling backend explicitly supports the following features: - (except GGUF) - -- (requires `transformers>=4.49.0`) +- -#### Remote code +#### Remote Code -Earlier we mentioned that the Transformers fallback enables you to run remote code models directly in vLLM. -If you are interested in this feature, this section is for you! +If your model is neither supported natively by vLLM or Transformers, you can still run it in vLLM! Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers. Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM! -```python +:::{tip} +If you have not yet created your custom model, you can follow this guide on [customising models in Transformers](https://huggingface.co/docs/transformers/en/custom_models). +::: + +```python from vllm import LLM llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model llm.apply_model(lambda model: print(model.__class__)) ``` -To make your model compatible with the Transformers fallback, it needs: +To make your model compatible with the Transformers backend, it needs: ```{code-block} python :caption: modeling_my_model.py @@ -119,9 +95,11 @@ Here is what happens in the background: 1. The config is loaded 2. 
`MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`. -3. The `TransformersModel` backend is used. See , which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`. +3. The `TransformersForCausalLM` backend is used. See , which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`. -To make your model compatible with tensor parallel, it needs: +That's it! + +For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: ```{code-block} python :caption: configuration_my_model.py @@ -130,20 +108,94 @@ from transformers import PretrainedConfig class MyConfig(PretrainedConfig): base_model_tp_plan = { - "layers.*.self_attn.q_proj": "colwise", - ... + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), } ``` +- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). 
+- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s: + * You only need to do this for layers which are not present on all pipeline stages + * vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages + * The `list` in the first element of the `tuple` contains the names of the input arguments + * The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code + +## Loading a Model + +### Hugging Face Hub + +By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome). + +To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository. +If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. + +Models do not _need_ to be natively supported to be used in vLLM. +The [Transformers backend](#transformers-backend) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). + :::{tip} -`base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). 
+The easiest way to check if your model is really supported at runtime is to run the program below: + +```python +from vllm import LLM + +# For generative models (task=generate) only +llm = LLM(model=..., task="generate") # Name or path of your model +output = llm.generate("Hello, my name is") +print(output) + +# For pooling models (task={embed,classify,reward,score}) only +llm = LLM(model=..., task="embed") # Name or path of your model +output = llm.encode("Hello, my name is") +print(output) +``` + +If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. ::: -That's it! +Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. +Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. + +#### Using a proxy + +Here are some tips for loading/downloading models from Hugging Face using a proxy: + +- Set the proxy globally for your session (or set it in the profile file): + +```shell +export http_proxy=http://your.proxy.server:port +export https_proxy=http://your.proxy.server:port +``` + +- Set the proxy for just the current command: + +```shell +https_proxy=http://your.proxy.server:port huggingface-cli download + +# or use vllm cmd directly +https_proxy=http://your.proxy.server:port vllm serve --disable-log-requests +``` + +- Set the proxy in Python interpreter: + +```python +import os + +os.environ['http_proxy'] = 'http://your.proxy.server:port' +os.environ['https_proxy'] = 'http://your.proxy.server:port' +``` ### ModelScope -To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: +To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable: ```shell export VLLM_USE_MODELSCOPE=True @@ -165,6 +217,8 @@ output = llm.encode("Hello, my name is") 
print(output) ``` +(supported-text-models)= + ## List of Text-only Language Models ### Generative Models @@ -197,6 +251,11 @@ See [this page](#generative-models) for more information on how to use generativ * `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. * ✅︎ * ✅︎ +- * `BambaForCausalLM` + * Bamba + * `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` + * + * - * `BloomForCausalLM` * BLOOM, BLOOMZ, BLOOMChat * `bigscience/bloom`, `bigscience/bloomz`, etc. @@ -207,9 +266,9 @@ See [this page](#generative-models) for more information on how to use generativ * `facebook/bart-base`, `facebook/bart-large-cnn`, etc. * * -- * `ChatGLMModel` +- * `ChatGLMModel`, `ChatGLMForConditionalGeneration` * ChatGLM - * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. + * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. * ✅︎ * ✅︎ - * `CohereForCausalLM`, `Cohere2ForCausalLM` @@ -224,7 +283,7 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ - * `DeciLMForCausalLM` * DeciLM - * `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. + * `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. * * ✅︎ - * `DeepseekForCausalLM` @@ -277,6 +336,11 @@ See [this page](#generative-models) for more information on how to use generativ * `THUDM/glm-4-9b-chat-hf`, etc. * ✅︎ * ✅︎ +- * `Glm4ForCausalLM` + * GLM-4-0414 + * `THUDM/GLM-4-32B-0414`, etc. + * ✅︎ + * ✅︎ - * `GPT2LMHeadModel` * GPT-2 * `gpt2`, `gpt2-xl`, etc. @@ -437,6 +501,11 @@ See [this page](#generative-models) for more information on how to use generativ * `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. * * ✅︎ +- * `Plamo2ForCausalLM` + * PLaMo2 + * `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. + * + * - * `QWenLMHeadModel` * Qwen * `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. @@ -452,6 +521,16 @@ See [this page](#generative-models) for more information on how to use generativ * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. 
* * ✅︎ +- * `Qwen3ForCausalLM` + * Qwen3 + * `Qwen/Qwen3-8B`, etc. + * ✅︎ + * ✅︎ +- * `Qwen3MoeForCausalLM` + * Qwen3MoE + * `Qwen/Qwen3-MoE-15B-A2B`, etc. + * ✅︎ + * ✅︎ - * `StableLmForCausalLM` * StableLM * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. @@ -482,6 +561,11 @@ See [this page](#generative-models) for more information on how to use generativ * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. * ✅︎ * ✅︎ +- * `MiniMaxText01ForCausalLM` + * MiniMax-Text + * `MiniMaxAI/MiniMax-Text-01`, etc. + * + * ✅︎ - * `Zamba2ForCausalLM` * Zamba2 * `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. @@ -545,7 +629,7 @@ you should explicitly specify the task type to ensure that the model is used in * - * `XLMRobertaModel` * XLM-RoBERTa-based - * `intfloat/multilingual-e5-large`, etc. + * `intfloat/multilingual-e5-large`, `jinaai/jina-reranker-v2-base-multilingual`, etc. * * ::: @@ -660,6 +744,11 @@ If your model is not in the above list, we will try to automatically convert the * `BAAI/bge-reranker-v2-m3`, etc. * * +- * `ModernBertForSequenceClassification` + * ModernBert-based + * `Alibaba-NLP/gte-reranker-modernbert-base`, etc. + * + * ::: (supported-mm-models)= @@ -684,7 +773,7 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. :::{important} -To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) +**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference) or `--limit-mm-per-prompt` (online serving). 
For example, to enable passing up to 4 images per text prompt: Offline inference: @@ -699,9 +788,11 @@ llm = LLM( Online serving: ```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}' ``` +**This is no longer required if you are using vLLM V1.** + ::: :::{note} @@ -732,6 +823,13 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ +- * `AyaVisionForConditionalGeneration` + * Aya Vision + * T + I+ + * `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. + * + * ✅︎ + * ✅︎ - * `Blip2ForConditionalGeneration` * BLIP-2 * T + IE @@ -796,9 +894,23 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ - * `InternVLChatModel` - * InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 + * InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 * T + IE+ - * `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. + * `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. + * + * ✅︎ + * ✅︎ +- * `KimiVLForConditionalGeneration` + * Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking + * T + I+ + * `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` + * + * + * ✅︎ +- * `Llama4ForConditionalGeneration` + * Llama 4 + * T + I+ + * `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. * * ✅︎ * ✅︎ @@ -836,14 +948,21 @@ See [this page](#generative-models) for more information on how to use generativ * `openbmb/MiniCPM-o-2_6`, etc. * ✅︎ * ✅︎ - * + * ✅︎ - * `MiniCPMV` * MiniCPM-V * T + IE+ + VE+ * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. 
* ✅︎ * ✅︎ + * ✅︎ +- * `Mistral3ForConditionalGeneration` + * Mistral3 + * T + I+ + * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. * + * ✅︎ + * ✅︎ - * `MllamaForConditionalGeneration` * Llama 3.2 * T + I+ @@ -853,7 +972,7 @@ See [this page](#generative-models) for more information on how to use generativ * - * `MolmoForCausalLM` * Molmo - * T + I + * T + I+ * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. * ✅︎ * ✅︎ @@ -885,7 +1004,7 @@ See [this page](#generative-models) for more information on how to use generativ * `microsoft/Phi-4-multimodal-instruct`, etc. * ✅︎ * - * + * ✅︎ - * `PixtralForConditionalGeneration` * Pixtral * T + I+ @@ -921,6 +1040,27 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ * ✅︎ * ✅︎ +- * `Qwen2_5OmniThinkerForConditionalGeneration` + * Qwen2.5-Omni + * T + IE+ + VE+ + A+ + * `Qwen/Qwen2.5-Omni-7B` + * + * ✅︎ + * ✅︎\* +- * `SkyworkR1VChatModel` + * Skywork-R1V-38B + * T + I + * `Skywork/Skywork-R1V-38B` + * + * ✅︎ + * ✅︎ +- * `SmolVLMForConditionalGeneration` + * SmolVLM2 + * T + I + * `SmolVLM2-2.2B-Instruct` + * + * ✅︎ + * ✅︎ - * `UltravoxModel` * Ultravox * T + AE+ @@ -937,9 +1077,6 @@ See [this page](#generative-models) for more information on how to use generativ + Multiple items can be inputted per text prompt for this modality. :::{important} -To use Gemma3 series models, you have to install Hugging Face Transformers library from source via -`pip install git+https://github.com/huggingface/transformers`. - Pan-and-scan image pre-processing is currently supported on V0 (but not V1). You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`. ::: @@ -979,6 +1116,14 @@ For more details, please see: Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1. 
::: +:::{note} +To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via +`pip install git+https://github.com/huggingface/transformers.git`. + +Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1. +`--mm-processor-kwargs '{"use_audio_in_video": True}'`. +::: + ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. @@ -1059,7 +1204,7 @@ At vLLM, we are committed to facilitating the integration and support of third-p 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. :::{tip} - When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. 
+ When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. ::: 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. @@ -1076,5 +1221,5 @@ We have the following levels of testing for models: 1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test. 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. -3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:main/examples) for the models that have passed this test. +3. **Runtime Functionality**: We check if the model can be loaded and run without errors. 
This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test. 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md index 5b0f8421a51eb..ccbe8a367061f 100644 --- a/docs/source/performance/optimization.md +++ b/docs/source/performance/optimization.md @@ -31,6 +31,8 @@ vLLM supports an experimental feature chunked prefill. Chunked prefill allows to You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor. ```python +from vllm import LLM + llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) # Set max_num_batched_tokens to tune performance. # NOTE: 2048 is the default max_num_batched_tokens for chunked prefill. 
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 2e2016c95e4fc..d9a093e8d145d 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -21,6 +21,8 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType` You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: ```python +from vllm import LLM + llm = LLM(model="llava-hf/llava-1.5-7b-hf") # Refer to the HuggingFace repo for the correct format to use @@ -65,6 +67,8 @@ Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: ```python +from vllm import LLM + llm = LLM( model="microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, # Required to load Phi-3.5-vision @@ -96,6 +100,8 @@ Full example: . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +::: + #### Quantization Quantized models take less memory at the cost of lower precision. @@ -92,11 +106,46 @@ You can further reduce memory usage by limiting the context length of the model and the maximum batch size (`max_num_seqs` option). ```python +from vllm import LLM + llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2) ``` +#### Reduce CUDA Graphs + +By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU. + +:::{important} +CUDA graph capture takes up more memory in V1 than in V0. 
+::: + +You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: + +```python +from vllm import LLM +from vllm.config import CompilationConfig, CompilationLevel + +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + # By default, it goes up to max_num_seqs + cudagraph_capture_sizes=[1, 2, 4, 8, 16], + ), +) +``` + +You can disable graph capturing completely via the `enforce_eager` flag: + +```python +from vllm import LLM + +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True) +``` + #### Adjust cache size If you run out of CPU RAM, try the following options: @@ -104,6 +153,62 @@ If you run out of CPU RAM, try the following options: - (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). +#### Multi-modal input limits + +You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model: + +```python +from vllm import LLM + +# Accept up to 3 images and 1 video per prompt +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"image": 3, "video": 1}) +``` + +You can go a step further and disable unused modalities completely by setting its limit to zero. +For example, if your application only accepts image input, there is no need to allocate any memory for videos. + +```python +from vllm import LLM + +# Accept any number of images but no videos +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"video": 0}) +``` + +You can even run a multi-modal model for text-only inference: + +```python +from vllm import LLM + +# Don't accept images. Just text. 
+llm = LLM(model="google/gemma-3-27b-it", + limit_mm_per_prompt={"image": 0}) +``` + +#### Multi-modal processor arguments + +For certain models, you can adjust the multi-modal processor arguments to +reduce the size of the processed multi-modal inputs, which in turn saves memory. + +Here are some examples: + +```python +from vllm import LLM + +# Available for Qwen2-VL series models +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={ + "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 + }) + +# Available for InternVL series models +llm = LLM(model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={ + "max_dynamic_patch": 4, # Default is 12 + }) +``` + ### Performance optimization and tuning You can potentially improve the performance of vLLM by finetuning various options. diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 1cebff7e1f6e2..34382c87a484b 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -2,15 +2,15 @@ # OpenAI-Compatible Server -vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! +vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! This functionality lets you serve models and interact with them using an HTTP client. -You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker): +In your terminal, you can [install](../getting_started/installation.md) vLLM, then start the server with the [`vllm serve`](#vllm-serve) command. (You can also use our [Docker](#deployment-docker) image.) 
```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` -To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client. +To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python). ```python from openai import OpenAI @@ -33,11 +33,13 @@ print(completion.choices[0].message) vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`. ::: + :::{important} By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. To disable this behavior, please pass `--generation-config vllm` when launching the server. ::: + ## Supported APIs We currently support the following OpenAI APIs: @@ -172,6 +174,12 @@ print(completion._request_id) The `vllm serve` command is used to launch the OpenAI-compatible server. +:::{tip} +The vast majority of command-line arguments are based on those for offline inference. + +See [here](configuration-options) for some common options. 
+::: + :::{argparse} :module: vllm.entrypoints.openai.cli_args :func: create_parser_for_docs @@ -188,6 +196,7 @@ For example: ```yaml # config.yaml +model: meta-llama/Llama-3.1-8B-Instruct host: "127.0.0.1" port: 6379 uvicorn-log-level: "info" @@ -196,12 +205,13 @@ uvicorn-log-level: "info" To use the above config file: ```bash -vllm serve SOME_MODEL --config config.yaml +vllm serve --config config.yaml ``` :::{note} In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. The order of priorities is `command line > config file values > defaults`. +e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file. ::: ## API Reference @@ -392,9 +402,26 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`. ::: +Code example: -Code example: +#### Extra Parameters + +The following [sampling parameters](#sampling-params) are supported. + +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-transcription-sampling-params +:end-before: end-transcription-sampling-params +::: + +The following extra parameters are supported: + +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-transcription-extra-params +:end-before: end-transcription-extra-params +::: (tokenizer-api)= diff --git a/docs/source/serving/usage_stats.md b/docs/source/serving/usage_stats.md index cfc3cb2576873..750cba7ed9ce2 100644 --- a/docs/source/serving/usage_stats.md +++ b/docs/source/serving/usage_stats.md @@ -1,6 +1,8 @@ # Usage Stats Collection -vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. 
This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit. +vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information. + +A subset of the data, after cleaning and aggregation, will be publicly released for the community's benefit. For example, you can see the 2024 usage report [here](https://2024.vllm.ai). ## What data is collected? diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 840892ea07010..e3c75d5cb6a96 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -47,7 +47,7 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: model=model_name, trust_remote_code=True, max_model_len=4096, - max_num_seqs=5, + max_num_seqs=2, limit_mm_per_prompt={"audio": audio_count}, ) @@ -89,7 +89,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=4096, + max_model_len=12800, max_num_seqs=2, enable_lora=True, max_lora_rank=320, @@ -130,6 +130,36 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: ) +# Qwen2.5-Omni +def run_qwen2_5_omni(question: str, audio_count: int): + model_name = "Qwen/Qwen2.5-Omni-7B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) + + audio_in_prompt = "".join([ + "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) + ]) + + default_system = ( + "You are Qwen, a virtual human 
developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech.") + + prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) + + # Ultravox 0.5-1B def run_ultravox(question: str, audio_count: int) -> ModelRequestData: model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" @@ -182,59 +212,13 @@ model_example_map = { "minicpmo": run_minicpmo, "phi4_mm": run_phi4mm, "qwen2_audio": run_qwen2_audio, + "qwen2_5_omni": run_qwen2_5_omni, "ultravox": run_ultravox, "whisper": run_whisper, } -def main(args): - model = args.model_type - if model not in model_example_map: - raise ValueError(f"Model type {model} is not supported.") - - audio_count = args.num_audios - req_data = model_example_map[model](question_per_audio_count[audio_count], - audio_count) - - engine_args = asdict(req_data.engine_args) | {"seed": args.seed} - llm = LLM(**engine_args) - - # To maintain code compatibility in this script, we add LoRA here. - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) - if req_data.lora_requests: - for lora_request in req_data.lora_requests: - llm.llm_engine.add_lora(lora_request=lora_request) - - # We set temperature to 0.2 so that outputs can be different - # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(temperature=0.2, - max_tokens=64, - stop_token_ids=req_data.stop_token_ids) - - mm_data = {} - if audio_count > 0: - mm_data = { - "audio": [ - asset.audio_and_sample_rate - for asset in audio_assets[:audio_count] - ] - } - - assert args.num_prompts > 0 - inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data} - if args.num_prompts > 1: - # Batch inference - inputs = [inputs] * args.num_prompts - - outputs = llm.generate(inputs, sampling_params=sampling_params) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'audio language models') @@ -258,5 +242,61 @@ if __name__ == "__main__": default=None, help="Set the seed when initializing `vllm.LLM`.") - args = parser.parse_args() + return parser.parse_args() + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + audio_count = args.num_audios + req_data = model_example_map[model](question_per_audio_count[audio_count], + audio_count) + + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {}) + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=req_data.stop_token_ids) + + mm_data = {} + if audio_count > 0: + mm_data = { + "audio": [ + asset.audio_and_sample_rate + for asset in audio_assets[:audio_count] + ] + } + + assert args.num_prompts > 0 + inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data} + if args.num_prompts > 1: + # Batch inference + inputs = [inputs] * args.num_prompts + # Add LoRA request if applicable + lora_request = (req_data.lora_requests * + args.num_prompts if req_data.lora_requests else None) + + outputs = llm.generate( + inputs, + sampling_params=sampling_params, + lora_request=lora_request, + ) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + args = parse_args() main(args) diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py index a6e96c0bb4339..ae5ae7cb48346 100644 --- a/examples/offline_inference/basic/basic.py +++ b/examples/offline_inference/basic/basic.py @@ -12,13 +12,23 @@ prompts = [ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Create an LLM. -llm = LLM(model="facebook/opt-125m") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") \ No newline at end of file + +def main(): + # Create an LLM. + llm = LLM(model="facebook/opt-125m") + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py index b2523e533a40a..6857c6e9e31df 100644 --- a/examples/offline_inference/basic/chat.py +++ b/examples/offline_inference/basic/chat.py @@ -4,6 +4,24 @@ from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser +def create_parser(): + parser = FlexibleArgumentParser() + # Add engine args + engine_group = parser.add_argument_group("Engine arguments") + EngineArgs.add_cli_args(engine_group) + engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") + # Add sampling params + sampling_group = parser.add_argument_group("Sampling parameters") + sampling_group.add_argument("--max-tokens", type=int) + sampling_group.add_argument("--temperature", type=float) + sampling_group.add_argument("--top-p", type=float) + sampling_group.add_argument("--top-k", type=int) + # Add example params + parser.add_argument("--chat-template-path", type=str) + + return parser + + def main(args: dict): # Pop arguments not used by LLM max_tokens = args.pop("max_tokens") @@ -27,12 +45,13 @@ def main(args: dict): sampling_params.top_k = top_k def print_outputs(outputs): + print("\nGenerated Outputs:\n" + "-" * 80) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}") + print(f"Prompt: {prompt!r}\n") print(f"Generated text: {generated_text!r}") - print("-" * 80) + print("-" * 80) print("=" * 80) @@ -81,18 +100,6 @@ def main(args: dict): if __name__ == "__main__": - parser = FlexibleArgumentParser() - # Add engine args - engine_group = parser.add_argument_group("Engine arguments") - EngineArgs.add_cli_args(engine_group) - 
engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") - # Add sampling params - sampling_group = parser.add_argument_group("Sampling parameters") - sampling_group.add_argument("--max-tokens", type=int) - sampling_group.add_argument("--temperature", type=float) - sampling_group.add_argument("--top-p", type=float) - sampling_group.add_argument("--top-k", type=int) - # Add example params - parser.add_argument("--chat-template-path", type=str) + parser = create_parser() args: dict = vars(parser.parse_args()) main(args) diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 4ef949b4784de..5b6dcb41eee1c 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach", + task="classify", + enforce_eager=True) + return parser.parse_args() + + def main(args: Namespace): # Sample prompts. prompts = [ @@ -23,20 +33,16 @@ def main(args: Namespace): outputs = model.classify(prompts) # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): probs = output.outputs.probs probs_trimmed = ((str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs) - print(f"Prompt: {prompt!r} | " + print(f"Prompt: {prompt!r} \n" f"Class Probabilities: {probs_trimmed} (size={len(probs)})") + print("-" * 60) if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser = EngineArgs.add_cli_args(parser) - # Set example specific arguments - parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach", - task="classify", - enforce_eager=True) - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index f1655b6dbe111..cb5f923ffb697 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="intfloat/e5-mistral-7b-instruct", + task="embed", + enforce_eager=True) + return parser.parse_args() + + def main(args: Namespace): # Sample prompts. prompts = [ @@ -23,20 +33,16 @@ def main(args: Namespace): outputs = model.embed(prompts) # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding embeds_trimmed = ((str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds) - print(f"Prompt: {prompt!r} | " + print(f"Prompt: {prompt!r} \n" f"Embeddings: {embeds_trimmed} (size={len(embeds)})") + print("-" * 60) if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser = EngineArgs.add_cli_args(parser) - # Set example specific arguments - parser.set_defaults(model="intfloat/e5-mistral-7b-instruct", - task="embed", - enforce_eager=True) - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/basic/generate.py b/examples/offline_inference/basic/generate.py index 93f4f2a36fac6..54b52b22a45a9 100644 --- a/examples/offline_inference/basic/generate.py +++ b/examples/offline_inference/basic/generate.py @@ -4,6 +4,22 @@ from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser +def create_parser(): + parser = FlexibleArgumentParser() + # Add engine args + engine_group = parser.add_argument_group("Engine arguments") + EngineArgs.add_cli_args(engine_group) + engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") + # Add sampling params + sampling_group = parser.add_argument_group("Sampling parameters") + sampling_group.add_argument("--max-tokens", type=int) + sampling_group.add_argument("--temperature", type=float) + sampling_group.add_argument("--top-p", type=float) + sampling_group.add_argument("--top-k", type=int) + + return parser + + def main(args: dict): # Pop arguments not used by LLM max_tokens = args.pop("max_tokens") @@ -35,23 +51,15 @@ def main(args: dict): ] outputs = llm.generate(prompts, sampling_params) # Print the outputs. 
+ print("-" * 50) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) if __name__ == "__main__": - parser = FlexibleArgumentParser() - # Add engine args - engine_group = parser.add_argument_group("Engine arguments") - EngineArgs.add_cli_args(engine_group) - engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") - # Add sampling params - sampling_group = parser.add_argument_group("Sampling parameters") - sampling_group.add_argument("--max-tokens", type=int) - sampling_group.add_argument("--temperature", type=float) - sampling_group.add_argument("--top-p", type=float) - sampling_group.add_argument("--top-k", type=int) + parser = create_parser() args: dict = vars(parser.parse_args()) main(args) diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index 2d21f1f0e3971..d2bda8b3180c3 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="BAAI/bge-reranker-v2-m3", + task="score", + enforce_eager=True) + return parser.parse_args() + + def main(args: Namespace): # Sample prompts. text_1 = "What is the capital of France?" @@ -22,17 +32,13 @@ def main(args: Namespace): outputs = model.score(text_1, texts_2) # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) for text_2, output in zip(texts_2, outputs): score = output.outputs.score - print(f"Pair: {[text_1, text_2]!r} | Score: {score}") + print(f"Pair: {[text_1, text_2]!r} \nScore: {score}") + print("-" * 60) if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser = EngineArgs.add_cli_args(parser) - # Set example specific arguments - parser.set_defaults(model="BAAI/bge-reranker-v2-m3", - task="score", - enforce_eager=True) - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/offline_inference/batch_llm_inference.py new file mode 100644 index 0000000000000..6548857b6d111 --- /dev/null +++ b/examples/offline_inference/batch_llm_inference.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This example shows how to use Ray Data for data parallel batch inference. + +Ray Data is a data processing framework that can handle large datasets +and integrates tightly with vLLM for data-parallel inference. + +As of Ray 2.44, Ray Data has a native integration with +vLLM (under ray.data.llm). + +Ray Data provides functionality for: +* Reading and writing to cloud storage (S3, GCS, etc.) +* Automatic sharding and load-balancing across a cluster +* Optimized configuration of vLLM using continuous batching +* Compatible with tensor/pipeline parallel inference as well. + +Learn more about Ray Data's LLM integration: +https://docs.ray.io/en/latest/data/working-with-llms.html +""" +import ray +from packaging.version import Version +from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig + +assert Version(ray.__version__) >= Version( + "2.44.1"), "Ray version must be at least 2.44.1" + +# Uncomment to reduce clutter in stdout +# ray.init(log_to_driver=False) +# ray.data.DataContext.get_current().enable_progress_bars = False + +# Read one text file from S3. 
Ray Data supports reading multiple files +# from cloud storage (such as JSONL, Parquet, CSV, binary format). +ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt") +print(ds.schema()) + +size = ds.count() +print(f"Size of dataset: {size} prompts") + +# Configure vLLM engine. +config = vLLMEngineProcessorConfig( + model_source="unsloth/Llama-3.1-8B-Instruct", + engine_kwargs={ + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4096, + "max_model_len": 16384, + }, + concurrency=1, # set the number of parallel vLLM replicas + batch_size=64, +) + +# Create a Processor object, which will be used to +# do batch inference on the dataset +vllm_processor = build_llm_processor( + config, + preprocess=lambda row: dict( + messages=[{ + "role": "system", + "content": "You are a bot that responds with haikus." + }, { + "role": "user", + "content": row["text"] + }], + sampling_params=dict( + temperature=0.3, + max_tokens=250, + )), + postprocess=lambda row: dict( + answer=row["generated_text"], + **row # This will return all the original columns in the dataset. + ), +) + +ds = vllm_processor(ds) + +# Peek first 10 results. +# NOTE: This is for local testing and debugging. For production use case, +# one should write full result out as shown below. +outputs = ds.take(limit=10) + +for output in outputs: + prompt = output["prompt"] + generated_text = output["generated_text"] + print(f"Prompt: {prompt!r}") + print(f"Generated text: {generated_text!r}") + +# Write inference output data out as Parquet files to S3. +# Multiple files would be written to the output destination, +# and each task would write one or more files separately. 
+# +# ds.write_parquet("s3://") diff --git a/examples/offline_inference/cpu_offload_lmcache.py b/examples/offline_inference/cpu_offload_lmcache.py index 8211629b24ecc..37aea281032fd 100644 --- a/examples/offline_inference/cpu_offload_lmcache.py +++ b/examples/offline_inference/cpu_offload_lmcache.py @@ -3,9 +3,12 @@ This file demonstrates the example usage of cpu offloading with LMCache. -Note that `pip install lmcache` is needed to run this example. -Learn more about LMCache in https://github.com/LMCache/LMCache. +Note that `lmcache` is needed to run this example. +Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1 +Learn more about LMCache environment setup, please refer to: +https://docs.lmcache.ai/getting_started/installation.html """ +import contextlib import os import time @@ -15,51 +18,83 @@ from lmcache.integration.vllm.utils import ENGINE_NAME from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig -# LMCache-related environment variables -# Use experimental features in LMCache -os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True" -# LMCache is set to use 256 tokens per chunk -os.environ["LMCACHE_CHUNK_SIZE"] = "256" -# Enable local CPU backend in LMCache -os.environ["LMCACHE_LOCAL_CPU"] = "True" -# Set local CPU memory limit to 5.0 GB -os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0" -# This example script runs two requests with a shared prefix. -shared_prompt = "Hello, how are you?" 
* 1000 -first_prompt = [ - shared_prompt + "Hello, my name is", -] -second_prompt = [ - shared_prompt + "Tell me a very long story", -] +def setup_environment_variables(): + # LMCache-related environment variables + # Use experimental features in LMCache + os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True" + # LMCache is set to use 256 tokens per chunk + os.environ["LMCACHE_CHUNK_SIZE"] = "256" + # Enable local CPU backend in LMCache + os.environ["LMCACHE_LOCAL_CPU"] = "True" + # Set local CPU memory limit to 5.0 GB + os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0" -sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) -ktc = KVTransferConfig.from_cli( - '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}') -# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB -# memory. Reduce the value if your GPU has less memory. -# Note that LMCache is not compatible with chunked prefill for now. -llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - enable_chunked_prefill=False, - gpu_memory_utilization=0.8) +@contextlib.contextmanager +def build_llm_with_lmcache(): + ktc = KVTransferConfig.from_cli( + '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}') + # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB + # memory. Reduce the value if your GPU has less memory. + # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392). 
+ llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + enable_chunked_prefill=True, + gpu_memory_utilization=0.8) -outputs = llm.generate(first_prompt, sampling_params) -for output in outputs: - generated_text = output.outputs[0].text - print(f"Generated text: {generated_text!r}") -print("First request done.") + try: + yield llm + finally: + # Clean up lmcache backend + LMCacheEngineBuilder.destroy(ENGINE_NAME) -time.sleep(1) -outputs = llm.generate(second_prompt, sampling_params) -for output in outputs: - generated_text = output.outputs[0].text - print(f"Generated text: {generated_text!r}") -print("Second request done.") +def print_output( + llm: LLM, + prompt: list[str], + sampling_params: SamplingParams, + req_str: str, +): + start = time.time() + outputs = llm.generate(prompt, sampling_params) + print("-" * 50) + for output in outputs: + generated_text = output.outputs[0].text + print(f"Generated text: {generated_text!r}") + print(f"Generation took {time.time() - start:.2f} seconds, " + f"{req_str} request done.") + print("-" * 50) -# Clean up lmcache backend -LMCacheEngineBuilder.destroy(ENGINE_NAME) + +def main(): + setup_environment_variables() + + with build_llm_with_lmcache() as llm: + + # This example script runs two requests with a shared prefix. + # Define the shared prompt and specific prompts + shared_prompt = "Hello, how are you?" 
* 1000 + first_prompt = [ + shared_prompt + "Hello, my name is", + ] + second_prompt = [ + shared_prompt + "Tell me a very long story", + ] + + sampling_params = SamplingParams(temperature=0, + top_p=0.95, + max_tokens=10) + + # Print the first output + print_output(llm, first_prompt, sampling_params, "first") + + time.sleep(1) + + # print the second output + print_output(llm, second_prompt, sampling_params, "second") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index b73770ce382cf..965915beaf58f 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -1,26 +1,83 @@ # SPDX-License-Identifier: Apache-2.0 -# usage: -# VLLM_USE_V1=1 python examples/offline_inference/data_parallel.py -# we need to have a launcher to create multiple data parallel -# ranks. And each rank will create a vLLM instance to process its own prompts. +""" +Usage: +Single node: + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 + +Multi-node: + Node 0 (assume the node has ip of 10.99.48.128): + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=0 \ + --master-addr=10.99.48.128 \ + --master-port=13345 + Node 1: + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=1 \ + --master-addr=10.99.48.128 \ + --master-port=13345 +""" import os +from time import sleep from vllm import LLM, SamplingParams from vllm.utils import get_open_port -GPUs_per_dp_rank = 2 -DP_size = 2 + +def parse_args(): + import argparse + parser = argparse.ArgumentParser(description="Data Parallel Inference") + parser.add_argument("--model", + type=str, + default="ibm-research/PowerMoE-3b", + help="Model 
name or path") + parser.add_argument("--dp-size", + type=int, + default=2, + help="Data parallel size") + parser.add_argument("--tp-size", + type=int, + default=2, + help="Tensor parallel size") + parser.add_argument("--node-size", + type=int, + default=1, + help="Total number of nodes") + parser.add_argument("--node-rank", + type=int, + default=0, + help="Rank of the current node") + parser.add_argument("--master-addr", + type=str, + default="", + help="Master node IP address") + parser.add_argument("--master-port", + type=int, + default=0, + help="Master node port") + return parser.parse_args() -def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank): - os.environ["VLLM_DP_RANK"] = str(dp_rank) +def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, + dp_master_port, GPUs_per_dp_rank): + os.environ["VLLM_DP_RANK"] = str(global_dp_rank) + os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) os.environ["VLLM_DP_SIZE"] = str(dp_size) os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port) - # set devices for each dp_rank - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( - str(i) for i in range(dp_rank * GPUs_per_dp_rank, (dp_rank + 1) * - GPUs_per_dp_rank)) + + # CUDA_VISIBLE_DEVICES for each DP rank is set automatically inside the + # engine processes. # Sample prompts. prompts = [ @@ -28,20 +85,20 @@ def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank): "The president of the United States is", "The capital of France is", "The future of AI is", - ] + ] * 100 # with DP, each rank should process different prompts. # usually all the DP ranks process a full dataset, # and each rank processes a different part of the dataset. 
promts_per_rank = len(prompts) // dp_size - start = dp_rank * promts_per_rank + start = global_dp_rank * promts_per_rank end = start + promts_per_rank prompts = prompts[start:end] if len(prompts) == 0: # if any rank has no prompts to process, # we need to set a placeholder prompt prompts = ["Placeholder"] - print(f"DP rank {dp_rank} needs to process {len(prompts)} prompts") + print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts") # Create a sampling params object. # since we are doing data parallel, every rank can have different @@ -49,37 +106,67 @@ def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank): # ranks for demonstration. sampling_params = SamplingParams(temperature=0.8, top_p=0.95, - max_tokens=16 * (dp_rank + 1)) + max_tokens=[16, 20][global_dp_rank % 2]) # Create an LLM. - llm = LLM(model="ibm-research/PowerMoE-3b", + llm = LLM(model=model, tensor_parallel_size=GPUs_per_dp_rank, enforce_eager=True, enable_expert_parallel=True) outputs = llm.generate(prompts, sampling_params) # Print the outputs. - for output in outputs: + for i, output in enumerate(outputs): + if i >= 5: + # print only 5 outputs + break prompt = output.prompt generated_text = output.outputs[0].text - print(f"DP rank {dp_rank}, Prompt: {prompt!r}, " + print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " f"Generated text: {generated_text!r}") + # Give engines time to pause their processing loops before exiting. 
+ sleep(1) + if __name__ == "__main__": + + args = parse_args() + + dp_size = args.dp_size + tp_size = args.tp_size + node_size = args.node_size + node_rank = args.node_rank + + if node_size == 1: + dp_master_ip = "127.0.0.1" + dp_master_port = get_open_port() + else: + dp_master_ip = args.master_addr + dp_master_port = args.master_port + + assert dp_size % node_size == 0, "dp_size should be divisible by node_size" + dp_per_node = dp_size // node_size + from multiprocessing import Process - dp_master_ip = "127.0.0.1" - dp_master_port = get_open_port() + procs = [] - for i in range(DP_size): + for local_dp_rank, global_dp_rank in enumerate( + range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)): proc = Process(target=main, - args=(DP_size, i, dp_master_ip, dp_master_port, - GPUs_per_dp_rank)) + args=(args.model, dp_size, local_dp_rank, + global_dp_rank, dp_master_ip, dp_master_port, + tp_size)) proc.start() procs.append(proc) exit_code = 0 for proc in procs: - proc.join() - if proc.exitcode: + proc.join(timeout=300) + if proc.exitcode is None: + print(f"Killing process {proc.pid} that " + f"didn't stop within 5 minutes.") + proc.kill() + exit_code = 1 + elif proc.exitcode: exit_code = proc.exitcode exit(exit_code) diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py new file mode 100644 index 0000000000000..66efbc0c9deec --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +# Read prompts from output.txt +prompts = [] +try: + with open("output.txt") as f: + for line in f: + prompts.append(line.strip()) + print(f"Loaded {len(prompts)} prompts from output.txt") +except FileNotFoundError: + print("Error: output.txt file not found") + exit(-1) + +sampling_params = 
SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + +llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + max_num_batched_tokens=64, + max_num_seqs=16, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' + '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}' + )) #, max_model_len=2048, max_num_batched_tokens=2048) + +# 1ST generation (prefill instance) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py new file mode 100644 index 0000000000000..f7cbf6557d54f --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +context = "Hi " * 1000 +context2 = "Hey " * 500 +prompts = [ + context + "Hello, my name is", + context + "The capital of France is", + context2 + "Your name is", + context2 + "The capital of China is", +] + +sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + +llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' + '"kv_connector_extra_config": ' + '{"shared_storage_path": "local_storage"}}') + ) #, max_model_len=2048, max_num_batched_tokens=2048) + +# 1ST generation (prefill instance) +outputs = llm.generate( + prompts, + sampling_params, +) + +new_prompts = [] +for output in outputs: + prompt = output.prompt + generated_text = 
output.outputs[0].text + new_prompts.append(prompt + generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +# Write new_prompts to output.txt +with open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") +print(f"Saved {len(new_prompts)} prompts to output.txt") diff --git a/examples/offline_inference/disaggregated-prefill-v1/run.sh b/examples/offline_inference/disaggregated-prefill-v1/run.sh new file mode 100644 index 0000000000000..0ebf45a1586a0 --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/run.sh @@ -0,0 +1,5 @@ +rm -rf local_storage/ +rm output.txt + +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py index 36ee24bf7f18b..d60985146c5c9 100644 --- a/examples/offline_inference/disaggregated_prefill.py +++ b/examples/offline_inference/disaggregated_prefill.py @@ -95,7 +95,7 @@ def run_decode(prefill_done): print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -if __name__ == "__main__": +def main(): prefill_done = Event() prefill_process = Process(target=run_prefill, args=(prefill_done, )) decode_process = Process(target=run_decode, args=(prefill_done, )) @@ -109,3 +109,7 @@ if __name__ == "__main__": # Terminate the prefill node when decode is finished decode_process.join() prefill_process.terminate() + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/disaggregated_prefill_lmcache.py b/examples/offline_inference/disaggregated_prefill_lmcache.py index 5c84bbfc92c53..7da6fb7aaa230 100644 --- a/examples/offline_inference/disaggregated_prefill_lmcache.py +++ b/examples/offline_inference/disaggregated_prefill_lmcache.py @@ -38,6 +38,10 @@ os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}" # `naive` 
indicates using raw bytes of the tensor without any compression os.environ["LMCACHE_REMOTE_SERDE"] = "naive" +prompts = [ + "Hello, how are you?" * 1000, +] + def run_prefill(prefill_done, prompts): # We use GPU 0 for prefill node. @@ -106,12 +110,7 @@ def run_lmcache_server(port): return server_proc -if __name__ == "__main__": - - prompts = [ - "Hello, how are you?" * 1000, - ] - +def main(): prefill_done = Event() prefill_process = Process(target=run_prefill, args=(prefill_done, prompts)) decode_process = Process(target=run_decode, args=(prefill_done, prompts)) @@ -128,3 +127,7 @@ if __name__ == "__main__": prefill_process.terminate() lmcache_server_process.terminate() lmcache_server_process.wait() + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/distributed.py b/examples/offline_inference/distributed.py deleted file mode 100644 index e890c6dad8bd1..0000000000000 --- a/examples/offline_inference/distributed.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -This example shows how to use Ray Data for running offline batch inference -distributively on a multi-nodes cluster. - -Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html -""" - -from typing import Any - -import numpy as np -import ray -from packaging.version import Version -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -from vllm import LLM, SamplingParams - -assert Version(ray.__version__) >= Version( - "2.22.0"), "Ray version must be at least 2.22.0" - -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Set tensor parallelism per instance. -tensor_parallel_size = 1 - -# Set number of instances. Each instance will use tensor_parallel_size GPUs. -num_instances = 1 - - -# Create a class to do batch inference. -class LLMPredictor: - - def __init__(self): - # Create an LLM. 
- self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - tensor_parallel_size=tensor_parallel_size) - - def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]: - # Generate texts from the prompts. - # The output is a list of RequestOutput objects that contain the prompt, - # generated text, and other information. - outputs = self.llm.generate(batch["text"], sampling_params) - prompt: list[str] = [] - generated_text: list[str] = [] - for output in outputs: - prompt.append(output.prompt) - generated_text.append(' '.join([o.text for o in output.outputs])) - return { - "prompt": prompt, - "generated_text": generated_text, - } - - -# Read one text file from S3. Ray Data supports reading multiple files -# from cloud storage (such as JSONL, Parquet, CSV, binary format). -ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt") - - -# For tensor_parallel_size > 1, we need to create placement groups for vLLM -# to use. Every actor has to have its own placement group. -def scheduling_strategy_fn(): - # One bundle per tensor parallel worker - pg = ray.util.placement_group( - [{ - "GPU": 1, - "CPU": 1 - }] * tensor_parallel_size, - strategy="STRICT_PACK", - ) - return dict(scheduling_strategy=PlacementGroupSchedulingStrategy( - pg, placement_group_capture_child_tasks=True)) - - -resources_kwarg: dict[str, Any] = {} -if tensor_parallel_size == 1: - # For tensor_parallel_size == 1, we simply set num_gpus=1. - resources_kwarg["num_gpus"] = 1 -else: - # Otherwise, we have to set num_gpus=0 and provide - # a function that will create a placement group for - # each instance. - resources_kwarg["num_gpus"] = 0 - resources_kwarg["ray_remote_args_fn"] = scheduling_strategy_fn - -# Apply batch inference for all input data. -ds = ds.map_batches( - LLMPredictor, - # Set the concurrency to the number of LLM instances. - concurrency=num_instances, - # Specify the batch size for inference. - batch_size=32, - **resources_kwarg, -) - -# Peek first 10 results. 
-# NOTE: This is for local testing and debugging. For production use case, -# one should write full result out as shown below. -outputs = ds.take(limit=10) -for output in outputs: - prompt = output["prompt"] - generated_text = output["generated_text"] - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -# Write inference output data out as Parquet files to S3. -# Multiple files would be written to the output destination, -# and each task would write one or more files separately. -# -# ds.write_parquet("s3://") diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index baa91b2d0364d..c7b4368c9b132 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -7,87 +7,108 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams -parser = argparse.ArgumentParser() -parser.add_argument( - "--dataset", - type=str, - default="./examples/data/gsm8k.jsonl", - help="downloaded from the eagle repo " \ - "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/" -) -parser.add_argument("--max_num_seqs", type=int, default=8) -parser.add_argument("--num_prompts", type=int, default=80) -parser.add_argument("--num_spec_tokens", type=int, default=2) -parser.add_argument("--tp", type=int, default=1) -parser.add_argument("--draft_tp", type=int, default=1) -parser.add_argument("--enforce_eager", action='store_true') -parser.add_argument("--enable_chunked_prefill", action='store_true') -parser.add_argument("--max_num_batched_tokens", type=int, default=2048) -parser.add_argument("--temp", type=float, default=0) +def load_prompts(dataset_path, num_prompts): + if os.path.exists(dataset_path): + prompts = [] + try: + with open(dataset_path) as f: + for line in f: + data = json.loads(line) + prompts.append(data["turns"][0]) + except Exception as e: + print(f"Error reading dataset: {e}") + return [] + else: + prompts = [ + "The future of AI is", "The president of the United States is" + 
] -args = parser.parse_args() + return prompts[:num_prompts] -print(args) -model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" -eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm" +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--dataset", + type=str, + default="./examples/data/gsm8k.jsonl", + help="downloaded from the eagle repo " \ + "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/" + ) + parser.add_argument("--max_num_seqs", type=int, default=8) + parser.add_argument("--num_prompts", type=int, default=80) + parser.add_argument("--num_spec_tokens", type=int, default=2) + parser.add_argument("--tp", type=int, default=1) + parser.add_argument("--draft_tp", type=int, default=1) + parser.add_argument("--enforce_eager", action='store_true') + parser.add_argument("--enable_chunked_prefill", action='store_true') + parser.add_argument("--max_num_batched_tokens", type=int, default=2048) + parser.add_argument("--temp", type=float, default=0) + return parser.parse_args() -max_model_len = 2048 -tokenizer = AutoTokenizer.from_pretrained(model_dir) +def main(): -if os.path.exists(args.dataset): - prompts = [] - num_prompts = args.num_prompts - with open(args.dataset) as f: - for line in f: - data = json.loads(line) - prompts.append(data["turns"][0]) -else: - prompts = ["The future of AI is", "The president of the United States is"] + args = parse_args() -prompts = prompts[:args.num_prompts] -num_prompts = len(prompts) + model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" + eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm" -prompt_ids = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - add_generation_prompt=True) - for prompt in prompts -] + max_model_len = 2048 -llm = LLM( - model=model_dir, - trust_remote_code=True, - tensor_parallel_size=args.tp, - enable_chunked_prefill=args.enable_chunked_prefill, - max_num_batched_tokens=args.max_num_batched_tokens, - enforce_eager=args.enforce_eager, - 
max_model_len=max_model_len, - max_num_seqs=args.max_num_seqs, - gpu_memory_utilization=0.8, - speculative_model=eagle_dir, - num_speculative_tokens=args.num_spec_tokens, - speculative_draft_tensor_parallel_size=args.draft_tp, - speculative_max_model_len=max_model_len, - disable_log_stats=False, -) + tokenizer = AutoTokenizer.from_pretrained(model_dir) -sampling_params = SamplingParams(temperature=args.temp, max_tokens=256) + prompts = load_prompts(args.dataset, args.num_prompts) -outputs = llm.generate(prompt_token_ids=prompt_ids, - sampling_params=sampling_params) + prompt_ids = [ + tokenizer.apply_chat_template([{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True) + for prompt in prompts + ] -# calculate the average number of accepted tokens per forward pass, +1 is -# to account for the token from the target model that's always going to be -# accepted -acceptance_counts = [0] * (args.num_spec_tokens + 1) -for output in outputs: - for step, count in enumerate(output.metrics.spec_token_acceptance_counts): - acceptance_counts[step] += count + llm = LLM( + model=model_dir, + trust_remote_code=True, + tensor_parallel_size=args.tp, + enable_chunked_prefill=args.enable_chunked_prefill, + max_num_batched_tokens=args.max_num_batched_tokens, + enforce_eager=args.enforce_eager, + max_model_len=max_model_len, + max_num_seqs=args.max_num_seqs, + gpu_memory_utilization=0.8, + speculative_config={ + "method": "eagle", + "model": eagle_dir, + "num_speculative_tokens": args.num_spec_tokens, + "draft_tensor_parallel_size": args.draft_tp, + "max_model_len": max_model_len, + }, + disable_log_stats=False, + ) -print(f"mean acceptance length: \ - {sum(acceptance_counts) / acceptance_counts[0]:.2f}") + sampling_params = SamplingParams(temperature=args.temp, max_tokens=256) + + outputs = llm.generate(prompt_token_ids=prompt_ids, + sampling_params=sampling_params) + + # calculate the average number of accepted tokens per forward pass, +1 is + # to account for the 
token from the target model that's always going to be + # accepted + acceptance_counts = [0] * (args.num_spec_tokens + 1) + for output in outputs: + for step, count in enumerate( + output.metrics.spec_token_acceptance_counts): + acceptance_counts[step] += count + + print("-" * 50) + print(f"mean acceptance length: \ + {sum(acceptance_counts) / acceptance_counts[0]:.2f}") + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py new file mode 100644 index 0000000000000..b347ddbf3197a --- /dev/null +++ b/examples/offline_inference/embed_jina_embeddings_v3.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) + return parser.parse_args() + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Follow the white rabbit.", # English + "Sigue al conejo blanco.", # Spanish + "Suis le lapin blanc.", # French + "跟着白兔走。", # Chinese + "اتبع الأرنب الأبيض.", # Arabic + "Folge dem weißen Kaninchen.", # German + ] + + # Create an LLM. + # You should pass task="embed" for embedding models + model = LLM(**vars(args)) + + # Generate embedding. The output is a list of EmbeddingRequestOutputs. + # Only text matching task is supported for now. See #16120 + outputs = model.embed(prompts) + + # Print the outputs. + print("\nGenerated Outputs:") + print("Only text matching task is supported for now. 
See #16120") + print("-" * 60) + for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + embeds_trimmed = ((str(embeds[:16])[:-1] + + ", ...]") if len(embeds) > 16 else embeds) + print(f"Prompt: {prompt!r} \n" + f"Embeddings for text matching: {embeds_trimmed} " + f"(size={len(embeds)})") + print("-" * 60) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py new file mode 100644 index 0000000000000..7a6cb02556d9a --- /dev/null +++ b/examples/offline_inference/embed_matryoshka_fy.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: Apache-2.0 + +from argparse import Namespace + +from vllm import LLM, EngineArgs, PoolingParams +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) + return parser.parse_args() + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Follow the white rabbit.", # English + "Sigue al conejo blanco.", # Spanish + "Suis le lapin blanc.", # French + "跟着白兔走。", # Chinese + "اتبع الأرنب الأبيض.", # Arabic + "Folge dem weißen Kaninchen.", # German + ] + + # Create an LLM. + # You should pass task="embed" for embedding models + model = LLM(**vars(args)) + + # Generate embedding. The output is a list of EmbeddingRequestOutputs. + outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32)) + + # Print the outputs. 
+ print("\nGenerated Outputs:") + print("-" * 60) + for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + embeds_trimmed = ((str(embeds[:16])[:-1] + + ", ...]") if len(embeds) > 16 else embeds) + print(f"Prompt: {prompt!r} \n" + f"Embeddings: {embeds_trimmed} " + f"(size={len(embeds)})") + print("-" * 60) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index 8765d1812cc53..c4916e00f473c 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -8,93 +8,112 @@ from vllm import LLM, SamplingParams from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt, zip_enc_dec_prompts) -dtype = "float" -# Create a BART encoder/decoder model instance -llm = LLM( - model="facebook/bart-large-cnn", - dtype=dtype, -) +def create_prompts(tokenizer): + # Test prompts + # + # This section shows all of the valid ways to prompt an + # encoder/decoder model. + # + # - Helpers for building prompts + text_prompt_raw = "Hello, my name is" + text_prompt = TextPrompt(prompt="The president of the United States is") + tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode( + prompt="The capital of France is")) + # - Pass a single prompt to encoder/decoder model + # (implicitly encoder input prompt); + # decoder input prompt is assumed to be None -# Get BART tokenizer -tokenizer = llm.llm_engine.get_tokenizer_group() + single_text_prompt_raw = text_prompt_raw # Pass a string directly + single_text_prompt = text_prompt # Pass a TextPrompt + single_tokens_prompt = tokens_prompt # Pass a TokensPrompt -# Test prompts -# -# This section shows all of the valid ways to prompt an -# encoder/decoder model. 
-# -# - Helpers for building prompts -text_prompt_raw = "Hello, my name is" -text_prompt = TextPrompt(prompt="The president of the United States is") -tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode( - prompt="The capital of France is")) -# - Pass a single prompt to encoder/decoder model -# (implicitly encoder input prompt); -# decoder input prompt is assumed to be None + # ruff: noqa: E501 + # - Pass explicit encoder and decoder input prompts within one data structure. + # Encoder and decoder prompts can both independently be text or tokens, with + # no requirement that they be the same prompt type. Some example prompt-type + # combinations are shown below, note that these are not exhaustive. -single_text_prompt_raw = text_prompt_raw # Pass a string directly -single_text_prompt = text_prompt # Pass a TextPrompt -single_tokens_prompt = tokens_prompt # Pass a TokensPrompt + enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( + # Pass encoder prompt string directly, & + # pass decoder prompt tokens + encoder_prompt=single_text_prompt_raw, + decoder_prompt=single_tokens_prompt, + ) + enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( + # Pass TextPrompt to encoder, and + # pass decoder prompt string directly + encoder_prompt=single_text_prompt, + decoder_prompt=single_text_prompt_raw, + ) + enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( + # Pass encoder prompt tokens directly, and + # pass TextPrompt to decoder + encoder_prompt=single_tokens_prompt, + decoder_prompt=single_text_prompt, + ) -# - Pass explicit encoder and decoder input prompts within one data structure. -# Encoder and decoder prompts can both independently be text or tokens, with -# no requirement that they be the same prompt type. Some example prompt-type -# combinations are shown below, note that these are not exhaustive. 
+ # - Finally, here's a useful helper function for zipping encoder and + # decoder prompts together into a list of ExplicitEncoderDecoderPrompt + # instances + zipped_prompt_list = zip_enc_dec_prompts( + ['An encoder prompt', 'Another encoder prompt'], + ['A decoder prompt', 'Another decoder prompt']) -enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt string directly, & - # pass decoder prompt tokens - encoder_prompt=single_text_prompt_raw, - decoder_prompt=single_tokens_prompt, -) -enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( - # Pass TextPrompt to encoder, and - # pass decoder prompt string directly - encoder_prompt=single_text_prompt, - decoder_prompt=single_text_prompt_raw, -) -enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt tokens directly, and - # pass TextPrompt to decoder - encoder_prompt=single_tokens_prompt, - decoder_prompt=single_text_prompt, -) + # - Let's put all of the above example prompts together into one list + # which we will pass to the encoder/decoder LLM. + return [ + single_text_prompt_raw, single_text_prompt, single_tokens_prompt, + enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3 + ] + zipped_prompt_list -# - Finally, here's a useful helper function for zipping encoder and -# decoder prompts together into a list of ExplicitEncoderDecoderPrompt -# instances -zipped_prompt_list = zip_enc_dec_prompts( - ['An encoder prompt', 'Another encoder prompt'], - ['A decoder prompt', 'Another decoder prompt']) - -# - Let's put all of the above example prompts together into one list -# which we will pass to the encoder/decoder LLM. -prompts = [ - single_text_prompt_raw, single_text_prompt, single_tokens_prompt, - enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3 -] + zipped_prompt_list - -print(prompts) # Create a sampling params object. 
-sampling_params = SamplingParams( - temperature=0, - top_p=1.0, - min_tokens=0, - max_tokens=20, -) +def create_sampling_params(): + return SamplingParams( + temperature=0, + top_p=1.0, + min_tokens=0, + max_tokens=20, + ) -# Generate output tokens from the prompts. The output is a list of -# RequestOutput objects that contain the prompt, generated -# text, and other information. -outputs = llm.generate(prompts, sampling_params) # Print the outputs. -for output in outputs: - prompt = output.prompt - encoder_prompt = output.encoder_prompt - generated_text = output.outputs[0].text - print(f"Encoder prompt: {encoder_prompt!r}, " - f"Decoder prompt: {prompt!r}, " - f"Generated text: {generated_text!r}") +def print_outputs(outputs): + print("-" * 50) + for i, output in enumerate(outputs): + prompt = output.prompt + encoder_prompt = output.encoder_prompt + generated_text = output.outputs[0].text + print(f"Output {i+1}:") + print(f"Encoder prompt: {encoder_prompt!r}\n" + f"Decoder prompt: {prompt!r}\n" + f"Generated text: {generated_text!r}") + print("-" * 50) + + +def main(): + dtype = "float" + + # Create a BART encoder/decoder model instance + llm = LLM( + model="facebook/bart-large-cnn", + dtype=dtype, + ) + + # Get BART tokenizer + tokenizer = llm.llm_engine.get_tokenizer_group() + + prompts = create_prompts(tokenizer) + sampling_params = create_sampling_params() + + # Generate output tokens from the prompts. The output is a list of + # RequestOutput objects that contain the prompt, generated + # text, and other information. 
+ outputs = llm.generate(prompts, sampling_params) + + print_outputs(outputs) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index 6d0c3ac1ee09a..2883c37ca2360 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -22,7 +22,7 @@ class ModelRequestData(NamedTuple): def run_florence2(): engine_args = EngineArgs( model="microsoft/Florence-2-large", - tokenizer="facebook/bart-large", + tokenizer="Isotr0py/Florence-2-tokenizer", max_num_seqs=8, trust_remote_code=True, limit_mm_per_prompt={"image": 1}, @@ -56,7 +56,7 @@ def run_florence2(): def run_mllama(): engine_args = EngineArgs( model="meta-llama/Llama-3.2-11B-Vision-Instruct", - max_model_len=4096, + max_model_len=8192, max_num_seqs=2, limit_mm_per_prompt={"image": 1}, dtype="half", @@ -126,6 +126,23 @@ model_example_map = { } +def parse_args(): + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for text generation') + parser.add_argument('--model-type', + '-m', + type=str, + default="mllama", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + return parser.parse_args() + + def main(args): model = args.model_type if model not in model_example_map: @@ -133,6 +150,11 @@ def main(args): req_data = model_example_map[model]() + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {}) + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} llm = LLM(**engine_args) @@ -143,6 +165,7 @@ def main(args): temperature=0, top_p=1.0, max_tokens=64, + 
skip_special_tokens=False, ) start = time.time() @@ -166,19 +189,5 @@ def main(args): if __name__ == "__main__": - parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models for text generation') - parser.add_argument('--model-type', - '-m', - type=str, - default="mllama", - choices=model_example_map.keys(), - help='Huggingface "model_type".') - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") - - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index e94f47b72b2e9..d84cd9ee9f52b 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 - +""" +This file demonstrates using the `LLMEngine` +for processing prompts with various sampling parameters. 
+""" import argparse from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams @@ -26,6 +29,7 @@ def process_requests(engine: LLMEngine, """Continuously process a list of prompts and handle the outputs.""" request_id = 0 + print('-' * 50) while test_prompts or engine.has_unfinished_requests(): if test_prompts: prompt, sampling_params = test_prompts.pop(0) @@ -37,6 +41,7 @@ def process_requests(engine: LLMEngine, for request_output in request_outputs: if request_output.finished: print(request_output) + print('-' * 50) def initialize_engine(args: argparse.Namespace) -> LLMEngine: @@ -45,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine: return LLMEngine.from_engine_args(engine_args) +def parse_args(): + parser = FlexibleArgumentParser( + description='Demo on using the LLMEngine class directly') + parser = EngineArgs.add_cli_args(parser) + return parser.parse_args() + + def main(args: argparse.Namespace): """Main function that sets up and runs the prompt processing.""" engine = initialize_engine(args) @@ -53,8 +65,5 @@ def main(args: argparse.Namespace): if __name__ == '__main__': - parser = FlexibleArgumentParser( - description='Demo on using the LLMEngine class directly') - parser = EngineArgs.add_cli_args(parser) - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py new file mode 100644 index 0000000000000..7e90d5d25e293 --- /dev/null +++ b/examples/offline_inference/load_sharded_state.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Validates the loading of a model saved with the sharded_state format. +This script demonstrates how to load a model that was previously saved +using save_sharded_state.py and validates it by running inference. 
+Example usage: +(First need to save a sharded_state model) + +python save_sharded_state.py \ + --model /path/to/load \ + --quantization deepspeedfp \ + --tensor-parallel-size 8 \ + --output /path/to/save/sharded/model + +python load_sharded_state.py \ + --model /path/to/saved/sharded/model \ + --load-format sharded_state \ + --quantization deepspeedfp \ + --tensor-parallel-size 8 \ + --prompt "Hello, my name is" \ + --max-tokens 50 +""" + +import dataclasses + +from vllm import LLM, EngineArgs, SamplingParams +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + # Add engine arguments + EngineArgs.add_cli_args(parser) + + # Override default load_format for clarity + parser.set_defaults(load_format="sharded_state") + + # Add validation arguments + parser.add_argument("--prompt", + type=str, + default="Hello, world!", + help="Prompt for validation") + parser.add_argument("--max-tokens", + type=int, + default=100, + help="Maximum number of tokens to generate") + parser.add_argument("--temperature", + type=float, + default=0.7, + help="Sampling temperature") + parser.add_argument("--top-p", + type=float, + default=1.0, + help="Top-p sampling parameter") + + return parser.parse_args() + + +def main(): + args = parse_args() + engine_args = EngineArgs.from_cli_args(args) + + print(f"Loading model from {engine_args.model} " + f"using format {engine_args.load_format}") + print(f"Tensor parallel size: {engine_args.tensor_parallel_size}") + + # Load the model using engine args + llm = LLM(**dataclasses.asdict(engine_args)) + + # Prepare sampling parameters + sampling_params = SamplingParams( + temperature=args.temperature, + top_p=args.top_p, + max_tokens=args.max_tokens, + ) + + print("\nRunning inference:") + print(f"Prompt: {args.prompt}") + + # Generate completion + outputs = llm.generate(args.prompt, sampling_params) + + # Display generated text + print("\nGenerated outputs:") + for output in outputs: + 
generated_text = output.outputs[0].text + print("-" * 50) + print(f"Full output: {args.prompt}{generated_text}") + print("-" * 50) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 43be2aa80773f..37c3181dc5faf 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -13,9 +13,14 @@ from vllm.sampling_params import SamplingParams # - Server: # # ```bash +# # Mistral format # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ # --tokenizer-mode mistral --config-format mistral --load-format mistral \ -# --limit-mm-per-prompt 'image=4' --max-model-len 16384 +# --limit-mm-per-prompt '{"image":4}' --max-model-len 16384 +# +# # HF format +# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ +# --limit-mm-per-prompt '{"image":4}' --max-model-len 16384 # ``` # # - Client: @@ -44,19 +49,23 @@ from vllm.sampling_params import SamplingParams # python demo.py simple # python demo.py advanced +# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs. +# These scripts have been tested on 2x L40 GPUs + def run_simple_demo(args: argparse.Namespace): model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" sampling_params = SamplingParams(max_tokens=8192) - # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs. 
llm = LLM( model=model_name, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral", + tokenizer_mode="mistral" if args.format == "mistral" else "auto", + config_format="mistral" if args.format == "mistral" else "auto", + load_format="mistral" if args.format == "mistral" else "auto", + limit_mm_per_prompt={"image": 1}, max_model_len=4096, max_num_seqs=2, + tensor_parallel_size=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) @@ -82,23 +91,25 @@ def run_simple_demo(args: argparse.Namespace): }, ] outputs = llm.chat(messages, sampling_params=sampling_params) - + print("-" * 50) print(outputs[0].outputs[0].text) + print("-" * 50) def run_advanced_demo(args: argparse.Namespace): model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" - max_img_per_msg = 5 + max_img_per_msg = 3 max_tokens_per_img = 4096 sampling_params = SamplingParams(max_tokens=8192, temperature=0.7) llm = LLM( model=model_name, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral", + tokenizer_mode="mistral" if args.format == "mistral" else "auto", + config_format="mistral" if args.format == "mistral" else "auto", + load_format="mistral" if args.format == "mistral" else "auto", limit_mm_per_prompt={"image": max_img_per_msg}, max_model_len=max_img_per_msg * max_tokens_per_img, + tensor_parallel_size=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) @@ -153,10 +164,12 @@ def run_advanced_demo(args: argparse.Namespace): ] outputs = llm.chat(messages=messages, sampling_params=sampling_params) + print("-" * 50) print(outputs[0].outputs[0].text) + print("-" * 50) -def main(): +def parse_args(): parser = argparse.ArgumentParser( description="Run a demo in simple or advanced mode.") @@ -166,12 +179,20 @@ def main(): help="Specify the demo mode: 'simple' or 'advanced'", ) + parser.add_argument('--format', + choices=["mistral", "hf"], + default="mistral", + help='Specify the format of the model to load.') + 
parser.add_argument( '--disable-mm-preprocessor-cache', action='store_true', help='If True, disables caching of multi-modal preprocessor/mapper.') + return parser.parse_args() - args = parser.parse_args() + +def main(): + args = parse_args() if args.mode == "simple": print("Running simple demo...") diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index 380c53fab2201..53c58a76d9dc1 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -1,4 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +""" +This file demonstrates the usage of text generation with an LLM model, +comparing the performance with and without speculative decoding. + +Note that still not support `v1`: +VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py +""" import gc import time @@ -7,7 +14,7 @@ from vllm import LLM, SamplingParams def time_generation(llm: LLM, prompts: list[str], - sampling_params: SamplingParams): + sampling_params: SamplingParams, title: str): # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. # Warmup first @@ -16,15 +23,18 @@ def time_generation(llm: LLM, prompts: list[str], start = time.time() outputs = llm.generate(prompts, sampling_params) end = time.time() - print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs])) + print("-" * 50) + print(title) + print("time: ", + (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs)) # Print the outputs. for output in outputs: generated_text = output.outputs[0].text print(f"text: {generated_text!r}") + print("-" * 50) -if __name__ == "__main__": - +def main(): template = ( "Below is an instruction that describes a task. 
Write a response " "that appropriately completes the request.\n\n### Instruction:\n{}" @@ -41,8 +51,7 @@ if __name__ == "__main__": # Create an LLM without spec decoding llm = LLM(model="meta-llama/Llama-2-13b-chat-hf") - print("Without speculation") - time_generation(llm, prompts, sampling_params) + time_generation(llm, prompts, sampling_params, "Without speculation") del llm gc.collect() @@ -55,5 +64,8 @@ if __name__ == "__main__": }, ) - print("With speculation") - time_generation(llm, prompts, sampling_params) + time_generation(llm, prompts, sampling_params, "With speculation") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 4b0d115e6609c..de409740292a8 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -61,6 +61,7 @@ def process_requests(engine: LLMEngine, """Continuously process a list of prompts and handle the outputs.""" request_id = 0 + print("-" * 50) while test_prompts or engine.has_unfinished_requests(): if test_prompts: prompt, sampling_params, lora_request = test_prompts.pop(0) @@ -75,6 +76,7 @@ def process_requests(engine: LLMEngine, for request_output in request_outputs: if request_output.finished: print(request_output) + print("-" * 50) def initialize_engine() -> LLMEngine: diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py index 517d1bfce95d8..5906c7b2c6b30 100644 --- a/examples/offline_inference/neuron.py +++ b/examples/offline_inference/neuron.py @@ -12,27 +12,36 @@ prompts = [ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Create an LLM. -llm = LLM( - model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. 
- # Currently, this is a known limitation in continuous batching support - # in transformers-neuronx. - # TODO(liangfu): Support paged-attention in transformers-neuronx. - max_model_len=1024, - block_size=1024, - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - tensor_parallel_size=2) -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def main(): + # Create an LLM. + llm = LLM( + model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + max_num_seqs=8, + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in transformers-neuronx. + # TODO(liangfu): Support paged-attention in transformers-neuronx. + max_model_len=1024, + block_size=1024, + # ruff: noqa: E501 + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. + device="neuron", + tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
+ print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py index c899a01a0bb93..af21274a3a5b8 100644 --- a/examples/offline_inference/neuron_int8_quantization.py +++ b/examples/offline_inference/neuron_int8_quantization.py @@ -22,31 +22,40 @@ prompts = [ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Create an LLM. -llm = LLM( - model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. - # Currently, this is a known limitation in continuous batching support - # in transformers-neuronx. - # TODO(liangfu): Support paged-attention in transformers-neuronx. - max_model_len=2048, - block_size=2048, - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - quantization="neuron_quant", - override_neuron_config={ - "cast_logits_dtype": "bfloat16", - }, - tensor_parallel_size=2) -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def main(): + # Create an LLM. 
+ llm = LLM( + model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + max_num_seqs=8, + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in transformers-neuronx. + # TODO(liangfu): Support paged-attention in transformers-neuronx. + max_model_len=2048, + block_size=2048, + # ruff: noqa: E501 + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. + device="neuron", + quantization="neuron_quant", + override_neuron_config={ + "cast_logits_dtype": "bfloat16", + }, + tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py index 4c326c417b4db..f0bec387d3a9b 100644 --- a/examples/offline_inference/prefix_caching.py +++ b/examples/offline_inference/prefix_caching.py @@ -31,55 +31,62 @@ generating_prompts = [prefix + prompt for prompt in prompts] # Create a sampling params object. sampling_params = SamplingParams(temperature=0.0) -# Create an LLM without prefix caching as a baseline. -regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) -print("Results without `enable_prefix_caching`") +def main(): + # Create an LLM without prefix caching as a baseline. + regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) -# Generate texts from the prompts. 
The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = regular_llm.generate(generating_prompts, sampling_params) + print("Results without `enable_prefix_caching`") -regular_generated_texts = [] -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - regular_generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + # ruff: noqa: E501 + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = regular_llm.generate(generating_prompts, sampling_params) -print("-" * 80) + regular_generated_texts = [] + # Print the outputs. + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + regular_generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) -# Destroy the LLM object and free up the GPU memory. -del regular_llm -cleanup_dist_env_and_memory() + # Destroy the LLM object and free up the GPU memory. + del regular_llm + cleanup_dist_env_and_memory() -# Create an LLM with prefix caching enabled. -prefix_cached_llm = LLM(model="facebook/opt-125m", - enable_prefix_caching=True, - gpu_memory_utilization=0.4) + # Create an LLM with prefix caching enabled. + prefix_cached_llm = LLM(model="facebook/opt-125m", + enable_prefix_caching=True, + gpu_memory_utilization=0.4) -# Warmup so that the shared prompt's KV cache is computed. -prefix_cached_llm.generate(generating_prompts[0], sampling_params) + # Warmup so that the shared prompt's KV cache is computed. + prefix_cached_llm.generate(generating_prompts[0], sampling_params) -# Generate with prefix caching. -outputs = prefix_cached_llm.generate(generating_prompts, sampling_params) + # Generate with prefix caching. 
+ outputs = prefix_cached_llm.generate(generating_prompts, sampling_params) -print("Results with `enable_prefix_caching`") + print("Results with `enable_prefix_caching`") -cached_generated_texts = [] -# Print the outputs. You should see the same outputs as before. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - cached_generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + cached_generated_texts = [] + # Print the outputs. You should see the same outputs as before. + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + cached_generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) -print("-" * 80) + # Compare the results and display the speedup + generated_same = all([ + regular_generated_texts[i] == cached_generated_texts[i] + for i in range(len(prompts)) + ]) + print(f"Generated answers are the same: {generated_same}") -# Compare the results and display the speedup -generated_same = all([ - regular_generated_texts[i] == cached_generated_texts[i] - for i in range(len(prompts)) -]) -print(f"Generated answers are the same: {generated_same}") + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 3ae507cac5ce1..f97a1f32e6210 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -417,6 +417,38 @@ def run_model(input_data, return pred_imgs +def parse_args(): + parser = argparse.ArgumentParser("MAE run inference", add_help=False) + + parser.add_argument( + "--data_file", + type=str, + default="./India_900498_S2Hand.tif", + help="Path to the file.", + ) + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="Path to the directory where to save 
outputs.", + ) + parser.add_argument( + "--input_indices", + default=[1, 2, 3, 8, 11, 12], + type=int, + nargs="+", + help= + "0-based indices of the six Prithvi channels to be selected from the " + "input. By default selects [1,2,3,8,11,12] for S2L1C data.", + ) + parser.add_argument( + "--rgb_outputs", + action="store_true", + help="If present, output files will only contain RGB channels. " + "Otherwise, all bands will be saved.", + ) + + def main( data_file: str, output_dir: str, @@ -496,35 +528,7 @@ def main( if __name__ == "__main__": - parser = argparse.ArgumentParser("MAE run inference", add_help=False) - parser.add_argument( - "--data_file", - type=str, - default="./India_900498_S2Hand.tif", - help="Path to the file.", - ) - parser.add_argument( - "--output_dir", - type=str, - default="output", - help="Path to the directory where to save outputs.", - ) - parser.add_argument( - "--input_indices", - default=[1, 2, 3, 8, 11, 12], - type=int, - nargs="+", - help= - "0-based indices of the six Prithvi channels to be selected from the " - "input. By default selects [1,2,3,8,11,12] for S2L1C data.", - ) - parser.add_argument( - "--rgb_outputs", - action="store_true", - help="If present, output files will only contain RGB channels. 
" - "Otherwise, all bands will be saved.", - ) - args = parser.parse_args() + args = parse_args() main(**vars(args)) diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index ffa76b4e4f2ce..9c818d0757345 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -234,9 +234,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], sampling_params.max_tokens = next(output_len_generator) assert isinstance(sampling_params.max_tokens, int) - prompt_token_ids = torch.randint( - llm.llm_engine.model_config.get_vocab_size(), - size=(prompt_len, )).tolist() + prompt_token_ids = torch.randint(llm.get_tokenizer().vocab_size, + size=(prompt_len, )).tolist() llm.llm_engine.add_request( request_id=f"seq{i}", @@ -360,7 +359,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], f" in folder {context.save_chrome_traces_folder}") -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser(description=""" Profile a model @@ -450,7 +449,10 @@ Profile a model EngineArgs.add_cli_args(parser) - args = parser.parse_args() + return parser.parse_args() + + +def main(args): context = ProfileContext( engine_args=EngineArgs.from_cli_args(args), **{ @@ -459,3 +461,8 @@ Profile a model if k in inspect.signature(ProfileContext).parameters }) run_profile(context, csv_output=args.csv, json_output=args.json) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md new file mode 100644 index 0000000000000..c30541a598cee --- /dev/null +++ b/examples/offline_inference/qwen2_5_omni/README.md @@ -0,0 +1,32 @@ +# Qwen2.5-Omni Offline Inference Examples + +This folder provides several example scripts on how to inference Qwen2.5-Omni offline. 
+ +## Thinker Only + +```bash +# Audio + image + video +python examples/offline_inference/qwen2_5_omni/only_thinker.py -q mixed_modalities + +# Read vision and audio inputs from a single video file +# NOTE: V1 engine does not support interleaved modalities yet. +VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q use_audio_in_video + +# Multiple audios +VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q multi_audios +``` + +This script will run the thinker part of Qwen2.5-Omni, and generate text response. + +You can also test Qwen2.5-Omni on a single modality: + +```bash +# Process audio inputs +python examples/offline_inference/audio_language.py --model-type qwen2_5_omni + +# Process image inputs +python examples/offline_inference/vision_language.py --modality image --model-type qwen2_5_omni + +# Process video inputs +python examples/offline_inference/vision_language.py --modality video --model-type qwen2_5_omni +``` diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py new file mode 100644 index 0000000000000..c75a990120e07 --- /dev/null +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -0,0 +1,160 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on Qwen2.5-Omni (thinker only). +""" + +from typing import NamedTuple + +import vllm.envs as envs +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.utils import FlexibleArgumentParser + + +class QueryResult(NamedTuple): + inputs: dict + limit_mm_per_prompt: dict[str, int] + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
+ +default_system = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech.") + + +def get_mixed_modalities_query() -> QueryResult: + question = ("What is recited in the audio? " + "What is the content of this image? Why is this video funny?") + prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + "<|vision_bos|><|IMAGE|><|vision_eos|>" + "<|vision_bos|><|VIDEO|><|vision_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n") + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "audio": + AudioAsset("mary_had_lamb").audio_and_sample_rate, + "image": + ImageAsset("cherry_blossom").pil_image.convert("RGB"), + "video": + VideoAsset(name="sample_demo_1.mp4", + num_frames=16).np_ndarrays, + }, + }, + limit_mm_per_prompt={ + "audio": 1, + "image": 1, + "video": 1 + }, + ) + + +def get_use_audio_in_video_query() -> QueryResult: + question = ("Describe the content of the video, " + "then convert what the baby say into text.") + prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n") + asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16) + audio = asset.get_audio(sampling_rate=16000) + assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. " + "Please launch this example with " + "`VLLM_USE_V1=0`.") + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "video": asset.np_ndarrays, + "audio": audio, + }, + "mm_processor_kwargs": { + "use_audio_in_video": True, + }, + }, + limit_mm_per_prompt={ + "audio": 1, + "video": 1 + }, + ) + + +def get_multi_audios_query() -> QueryResult: + question = "Are these two audio clips the same?" 
+ prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + "<|audio_bos|><|AUDIO|><|audio_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n") + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "audio": [ + AudioAsset("winning_call").audio_and_sample_rate, + AudioAsset("mary_had_lamb").audio_and_sample_rate, + ], + }, + }, + limit_mm_per_prompt={ + "audio": 2, + }, + ) + + +query_map = { + "mixed_modalities": get_mixed_modalities_query, + "use_audio_in_video": get_use_audio_in_video_query, + "multi_audios": get_multi_audios_query, +} + + +def main(args): + model_name = "Qwen/Qwen2.5-Omni-7B" + query_result = query_map[args.query_type]() + + llm = LLM(model=model_name, + max_model_len=5632, + max_num_seqs=5, + limit_mm_per_prompt=query_result.limit_mm_per_prompt, + seed=args.seed) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(temperature=0.2, max_tokens=64) + + outputs = llm.generate(query_result.inputs, + sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'audio language models') + parser.add_argument('--query-type', + '-q', + type=str, + default="mixed_modalities", + choices=query_map.keys(), + help='Query type.') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference/reproduciblity.py b/examples/offline_inference/reproduciblity.py index d0197bf6d5ba0..b2be117d1a0a9 100644 --- a/examples/offline_inference/reproduciblity.py +++ b/examples/offline_inference/reproduciblity.py @@ -19,8 +19,6 @@ SEED = 42 # because it is almost impossible to make the scheduling deterministic in the # online serving setting. 
-llm = LLM(model="facebook/opt-125m", seed=SEED) - prompts = [ "Hello, my name is", "The president of the United States is", @@ -29,8 +27,17 @@ prompts = [ ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -outputs = llm.generate(prompts, sampling_params) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def main(): + llm = LLM(model="facebook/opt-125m", seed=SEED) + outputs = llm.generate(prompts, sampling_params) + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index b0418c092ca3c..e0ed0ac49754b 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -85,11 +85,13 @@ sampling_params = SamplingParams(temperature=0) outputs = ray.get(llm.generate.remote(prompts, sampling_params)) +print("-" * 50) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, " + print(f"Prompt: {prompt!r}\n" f"Generated text: {generated_text!r}") + print("-" * 50) # set up the communication between the training process # and the inference engine. @@ -120,8 +122,10 @@ assert all(ray.get(llm.collective_rpc.remote("check_weights_changed"))) # use the updated model to generate texts, they will be nonsense # because the weights are all zeros. 
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params)) +print("-" * 50) for output in outputs_updated: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, " + print(f"Prompt: {prompt!r}\n" f"Generated text: {generated_text!r}") + print("-" * 50) diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 863276432cb9c..338380cc96841 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -29,20 +29,23 @@ from pathlib import Path from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser -parser = FlexibleArgumentParser() -EngineArgs.add_cli_args(parser) -parser.add_argument("--output", - "-o", - required=True, - type=str, - help="path to output checkpoint") -parser.add_argument("--file-pattern", - type=str, - help="string pattern of saved filenames") -parser.add_argument("--max-file-size", - type=str, - default=5 * 1024**3, - help="max size (in bytes) of each safetensors file") + +def parse_args(): + parser = FlexibleArgumentParser() + EngineArgs.add_cli_args(parser) + parser.add_argument("--output", + "-o", + required=True, + type=str, + help="path to output checkpoint") + parser.add_argument("--file-pattern", + type=str, + help="string pattern of saved filenames") + parser.add_argument("--max-file-size", + type=str, + default=5 * 1024**3, + help="max size (in bytes) of each safetensors file") + return parser.parse_args() def main(args): @@ -57,10 +60,25 @@ def main(args): # Prepare output directory Path(args.output).mkdir(exist_ok=True) # Dump worker states to output directory - model_executor = llm.llm_engine.model_executor - model_executor.save_sharded_state(path=args.output, - pattern=args.file_pattern, - max_size=args.max_file_size) + + # Check which engine version is being used + is_v1_engine = hasattr(llm.llm_engine, "engine_core") + + if is_v1_engine: + # 
For V1 engine, we need to use engine_core.save_sharded_state + print("Using V1 engine save path") + llm.llm_engine.engine_core.save_sharded_state( + path=args.output, + pattern=args.file_pattern, + max_size=args.max_file_size) + else: + # For V0 engine + print("Using V0 engine save path") + model_executor = llm.llm_engine.model_executor + model_executor.save_sharded_state(path=args.output, + pattern=args.file_pattern, + max_size=args.max_file_size) + # Copy metadata files to output directory for file in os.listdir(model_path): if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"): @@ -72,5 +90,5 @@ def main(args): if __name__ == "__main__": - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py index b45954b3bd54a..d583110c8e69b 100644 --- a/examples/offline_inference/simple_profiling.py +++ b/examples/offline_inference/simple_profiling.py @@ -18,8 +18,8 @@ prompts = [ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -if __name__ == "__main__": +def main(): # Create an LLM. llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1) @@ -32,11 +32,17 @@ if __name__ == "__main__": llm.stop_profile() # Print the outputs. + print("-" * 50) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) # Add a buffer to wait for profiler in the background process # (in case MP is on) to finish writing profiling output. 
time.sleep(10) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 38ffd7fb9903d..363b500e0adf8 100644 --- a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -1,4 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +""" +This file demonstrates the example usage of guided decoding +to generate structured outputs using vLLM. It shows how to apply +different guided decoding techniques such as Choice, Regex, JSON schema, +and Grammar to produce structured and formatted results +based on specific prompts. +""" from enum import Enum @@ -7,26 +14,21 @@ from pydantic import BaseModel from vllm import LLM, SamplingParams from vllm.sampling_params import GuidedDecodingParams -llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100) - # Guided decoding by Choice (list of possible options) -guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) -sampling_params = SamplingParams(guided_decoding=guided_decoding_params) -outputs = llm.generate( - prompts="Classify this sentiment: vLLM is wonderful!", - sampling_params=sampling_params, -) -print(outputs[0].outputs[0].text) +guided_decoding_params_choice = GuidedDecodingParams( + choice=["Positive", "Negative"]) +sampling_params_choice = SamplingParams( + guided_decoding=guided_decoding_params_choice) +prompt_choice = "Classify this sentiment: vLLM is wonderful!" # Guided decoding by Regex -guided_decoding_params = GuidedDecodingParams(regex="\w+@\w+\.com\n") -sampling_params = SamplingParams(guided_decoding=guided_decoding_params, - stop=["\n"]) -prompt = ("Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. 
Example result:" - "alan.turing@enigma.com\n") -outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) -print(outputs[0].outputs[0].text) +guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n") +sampling_params_regex = SamplingParams( + guided_decoding=guided_decoding_params_regex, stop=["\n"]) +prompt_regex = ( + "Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n") # Guided decoding by JSON using Pydantic schema @@ -44,37 +46,54 @@ class CarDescription(BaseModel): json_schema = CarDescription.model_json_schema() - -guided_decoding_params = GuidedDecodingParams(json=json_schema) -sampling_params = SamplingParams(guided_decoding=guided_decoding_params) -prompt = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") -outputs = llm.generate( - prompts=prompt, - sampling_params=sampling_params, -) -print(outputs[0].outputs[0].text) +guided_decoding_params_json = GuidedDecodingParams(json=json_schema) +sampling_params_json = SamplingParams( + guided_decoding=guided_decoding_params_json) +prompt_json = ("Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's") # Guided decoding by Grammar simplified_sql_grammar = """ - ?start: select_statement - - ?select_statement: "SELECT " column_list " FROM " table_name - - ?column_list: column_name ("," column_name)* - - ?table_name: identifier - - ?column_name: identifier - - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ +root ::= select_statement +select_statement ::= "SELECT " column " from " table " where " condition +column ::= "col_1 " | "col_2 " +table ::= "table_1 " | "table_2 " +condition ::= column "= " number +number ::= "1 " | "2 " """ -guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar) -sampling_params = SamplingParams(guided_decoding=guided_decoding_params) -prompt = ("Generate an SQL query to 
show the 'username' and 'email'" - "from the 'users' table.") -outputs = llm.generate( - prompts=prompt, - sampling_params=sampling_params, -) -print(outputs[0].outputs[0].text) +guided_decoding_params_grammar = GuidedDecodingParams( + grammar=simplified_sql_grammar) +sampling_params_grammar = SamplingParams( + guided_decoding=guided_decoding_params_grammar) +prompt_grammar = ("Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table.") + + +def format_output(title: str, output: str): + print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}") + + +def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM): + outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) + return outputs[0].outputs[0].text + + +def main(): + llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100) + + choice_output = generate_output(prompt_choice, sampling_params_choice, llm) + format_output("Guided decoding by Choice", choice_output) + + regex_output = generate_output(prompt_regex, sampling_params_regex, llm) + format_output("Guided decoding by Regex", regex_output) + + json_output = generate_output(prompt_json, sampling_params_json, llm) + format_output("Guided decoding by JSON", json_output) + + grammar_output = generate_output(prompt_grammar, sampling_params_grammar, + llm) + format_output("Guided decoding by Grammar", grammar_output) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py index 35df6011550f2..c6d9e6b47e21f 100644 --- a/examples/offline_inference/torchrun_example.py +++ b/examples/offline_inference/torchrun_example.py @@ -23,20 +23,26 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Use `distributed_executor_backend="external_launcher"` so that # this llm engine/instance only creates one worker. 
+# it is important to set an explicit seed to make sure that +# all ranks have the same random seed, so that sampling can be +# deterministic across ranks. llm = LLM( model="facebook/opt-125m", tensor_parallel_size=2, distributed_executor_backend="external_launcher", + seed=0, ) outputs = llm.generate(prompts, sampling_params) # all ranks will have the same outputs +print("-" * 50) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, " + print(f"Prompt: {prompt!r}\n" f"Generated text: {generated_text!r}") + print("-" * 50) """ Further tips: diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index 4a8f17ba1d0d7..dea717c36082f 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -14,19 +14,24 @@ answers = [ ] N = 1 # Currently, top-p sampling is disabled. `top_p` should be 1.0. -sampling_params = SamplingParams(temperature=0.7, - top_p=1.0, - n=N, - max_tokens=16) +sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) -# Set `enforce_eager=True` to avoid ahead-of-time compilation. -# In real workloads, `enforace_eager` should be `False`. -llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", - max_num_batched_tokens=64, - max_num_seqs=4) -outputs = llm.generate(prompts, sampling_params) -for output, answer in zip(outputs, answers): - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - assert generated_text.startswith(answer) + +def main(): + # Set `enforce_eager=True` to avoid ahead-of-time compilation. + # In real workloads, `enforace_eager` should be `False`. 
+ llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", + max_num_batched_tokens=64, + max_num_seqs=4) + outputs = llm.generate(prompts, sampling_params) + print("-" * 50) + for output, answer in zip(outputs, answers): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + assert generated_text.startswith(answer) + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 0adbe574370d3..d02ac17cfdd68 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -8,6 +8,7 @@ on HuggingFace model repository. """ import os import random +from contextlib import contextmanager from dataclasses import asdict from typing import NamedTuple, Optional @@ -44,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData: max_model_len=4096, max_num_seqs=2, dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [(f"<|im_start|>user\n<|img|>{question}" @@ -60,6 +61,28 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData: ) +# Aya Vision +def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "CohereForAI/aya-vision-8b" + + engine_args = EngineArgs( + model=model_name, + max_model_len=2048, + max_num_seqs=2, + mm_processor_kwargs={"crop_to_patches": True}, + limit_mm_per_prompt={"image": 1}, + ) + prompts = [ + f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" + for question in questions + ] + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # BLIP-2 def run_blip2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -68,8 +91,8 @@ def 
run_blip2(questions: list[str], modality: str) -> ModelRequestData: # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa prompts = [f"Question: {question} Answer:" for question in questions] engine_args = EngineArgs( - model="Salesforce/blip2-opt-2.7b", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + model="Salesforce/blip2-opt-6.7b", + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -87,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: model="facebook/chameleon-7b", max_model_len=4096, max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -106,8 +129,8 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: model=model_name, max_model_len=4096, max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, + limit_mm_per_prompt={"image": 1}, ) prompts = [ @@ -127,11 +150,12 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model="microsoft/Florence-2-large", - tokenizer="facebook/bart-large", - max_num_seqs=8, + tokenizer="Isotr0py/Florence-2-tokenizer", + max_model_len=4096, + max_num_seqs=2, trust_remote_code=True, dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = ["" for _ in questions] @@ -151,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -170,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: max_model_len=2048, max_num_seqs=2, 
mm_processor_kwargs={"do_pan_and_scan": True}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [("user\n" @@ -195,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: trust_remote_code=True, enforce_eager=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [ @@ -222,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: model=model_name, trust_remote_code=True, max_model_len=8192, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) tokenizer = AutoTokenizer.from_pretrained(model_name, @@ -263,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: "longest_edge": 3 * 364 }, }, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [( f"<|begin_of_text|>User:{question}\nAssistant:" @@ -275,6 +299,34 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ) +# SmolVLM2-2.2B-Instruct +def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + mm_processor_kwargs={ + "max_image_size": { + "longest_edge": 384 + }, + }, + limit_mm_per_prompt={"image": 1}, + ) + prompts = [ + (f"<|im_start|>User:{question}\nAssistant:") + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # InternVL def run_internvl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -285,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: 
model=model_name, trust_remote_code=True, max_model_len=4096, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) tokenizer = AutoTokenizer.from_pretrained(model_name, @@ -312,6 +364,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ) +# Kimi-VL +def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + prompts = [ + "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>" + f"<|media_pad|><|media_end|>{question}<|im_end|>" + "<|im_assistant|>assistant<|im_middle|>" for question in questions + ] + + engine_args = EngineArgs( + model="moonshotai/Kimi-VL-A3B-Instruct", + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": 1}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # LLaVA-1.5 def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -323,7 +398,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model="llava-hf/llava-1.5-7b-hf", max_model_len=4096, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -340,7 +415,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -362,7 +437,7 @@ def run_llava_next_video(questions: list[str], model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192, max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -390,7 +465,7 @@ def run_llava_onevision(questions: list[str], engine_args = EngineArgs( 
model="llava-hf/llava-onevision-qwen2-7b-ov-hf", max_model_len=16384, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -413,7 +488,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData: model="TIGER-Lab/Mantis-8B-siglip-llama3", max_model_len=4096, hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) stop_token_ids = [128009] @@ -454,7 +529,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): max_model_len=4096, max_num_seqs=2, trust_remote_code=True, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) # NOTE The stop_token_ids are different for various versions of MiniCPM-V # 2.0 @@ -497,6 +572,29 @@ def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData: return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6") +# Mistral-3 HF-format +def run_mistral3(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" + + # NOTE: Need L40 (or equivalent) to avoid OOM + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + tensor_parallel_size=2, + limit_mm_per_prompt={"image": 1}, + ) + + prompts = [f"[INST]{question}\n[IMG][/INST]" for question in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # LLama 3.2 def run_mllama(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -510,9 +608,9 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: # The configuration below has been confirmed to launch on a single L40 GPU. 
engine_args = EngineArgs( model=model_name, - max_model_len=4096, - max_num_seqs=16, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + max_model_len=8192, + max_num_seqs=2, + limit_mm_per_prompt={"image": 1}, ) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -536,6 +634,42 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ) +def run_llama4(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=4, + tensor_parallel_size=8, + gpu_memory_utilization=0.4, + limit_mm_per_prompt={"image": 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [[{ + "role": + "user", + "content": [{ + "type": "image" + }, { + "type": "text", + "text": f"{question}" + }] + }] for question in questions] + prompts = tokenizer.apply_chat_template(messages, + add_generation_prompt=True, + tokenize=False) + stop_token_ids = None + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + # Molmo def run_molmo(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -546,7 +680,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: model=model_name, trust_remote_code=True, dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [ @@ -572,7 +706,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: trust_remote_code=True, max_model_len=4096, tensor_parallel_size=4, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) tokenizer = AutoTokenizer.from_pretrained(model_name, @@ -599,7 +733,8 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData: prompts = ["caption 
en" for _ in questions] engine_args = EngineArgs( model="google/paligemma-3b-mix-224", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + limit_mm_per_prompt={"image": 1}, + ) return ModelRequestData( engine_args=engine_args, @@ -615,7 +750,8 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData: prompts = ["caption en" for _ in questions] engine_args = EngineArgs( model="google/paligemma2-3b-ft-docci-448", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + limit_mm_per_prompt={"image": 1}, + ) return ModelRequestData( engine_args=engine_args, @@ -651,7 +787,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData: max_num_seqs=2, # Note - mm_processor_kwargs can also be passed to generate/chat calls mm_processor_kwargs={"num_crops": 16}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -678,10 +814,14 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=4096, + max_model_len=5120, max_num_seqs=2, + max_num_batched_tokens=12800, enable_lora=True, max_lora_rank=320, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={"dynamic_hd": 16}, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -700,9 +840,9 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData: # NOTE: Need L40 (or equivalent) to avoid OOM engine_args = EngineArgs( model=model_name, - max_model_len=8192, + max_model_len=6144, max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [f"[INST]{question}\n[IMG][/INST]" for question in questions] @@ -723,7 +863,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData: max_model_len=1024, max_num_seqs=2, 
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [f"{question}Picture 1: \n" for question in questions] @@ -748,7 +888,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: "min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28, }, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) if modality == "image": @@ -783,7 +923,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: "max_pixels": 1280 * 28 * 28, "fps": 1, }, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) if modality == "image": @@ -804,8 +944,80 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: ) +# Qwen2.5-Omni +def run_qwen2_5_omni(questions: list[str], modality: str): + model_name = "Qwen/Qwen2.5-Omni-7B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + "fps": [1], + }, + limit_mm_per_prompt={"image": 1}, + ) + + if modality == "image": + placeholder = "<|IMAGE|>" + elif modality == "video": + placeholder = "<|VIDEO|>" + + default_system = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech.") + + prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n" + f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") for question in questions] + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + +# SkyworkR1V +def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = 
"Skywork/Skywork-R1V-38B" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [[{ + 'role': 'user', + 'content': f"\n{question}" + }] for question in questions] + prompts = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for SkyworkR1V + # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py + stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + model_example_map = { "aria": run_aria, + "aya_vision": run_aya_vision, "blip-2": run_blip2, "chameleon": run_chameleon, "deepseek_vl_v2": run_deepseek_vl2, @@ -816,6 +1028,7 @@ model_example_map = { "h2ovl_chat": run_h2ovl, "idefics3": run_idefics3, "internvl_chat": run_internvl, + "kimi_vl": run_kimi_vl, "llava": run_llava, "llava-next": run_llava_next, "llava-next-video": run_llava_next_video, @@ -823,7 +1036,9 @@ model_example_map = { "mantis": run_mantis, "minicpmo": run_minicpmo, "minicpmv": run_minicpmv, + "mistral3": run_mistral3, "mllama": run_mllama, + "llama4": run_llama4, "molmo": run_molmo, "NVLM_D": run_nvlm_d, "paligemma": run_paligemma, @@ -834,6 +1049,9 @@ model_example_map = { "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, "qwen2_5_vl": run_qwen2_5_vl, + "qwen2_5_omni": run_qwen2_5_omni, + "skywork_chat": run_skyworkr1v, + "smolvlm": run_smolvlm, } @@ -905,80 +1123,21 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, return inputs -def main(args): - model = args.model_type - if model not in model_example_map: - raise ValueError(f"Model type {model} is not supported.") - - modality = args.modality - mm_input = get_multi_modal_input(args) - 
data = mm_input["data"] - questions = mm_input["questions"] - - req_data = model_example_map[model](questions, modality) - - engine_args = asdict(req_data.engine_args) | {"seed": args.seed} - llm = LLM(**engine_args) - - # To maintain code compatibility in this script, we add LoRA here. - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) - if req_data.lora_requests: - for lora_request in req_data.lora_requests: - llm.llm_engine.add_lora(lora_request=lora_request) - - # Don't want to check the flag multiple times, so just hijack `prompts`. - prompts = req_data.prompts if args.use_different_prompt_per_request else [ - req_data.prompts[0] - ] - - # We set temperature to 0.2 so that outputs can be different - # even when all prompts are identical when running batch inference. - sampling_params = SamplingParams(temperature=0.2, - max_tokens=64, - stop_token_ids=req_data.stop_token_ids) - - assert args.num_prompts > 0 - if args.num_prompts == 1: - # Single inference - inputs = { - "prompt": prompts[0], - "multi_modal_data": { - modality: data - }, - } - else: - # Batch inference - if args.image_repeat_prob is not None: - # Repeat images with specified probability of "image_repeat_prob" - inputs = apply_image_repeat(args.image_repeat_prob, - args.num_prompts, data, prompts, - modality) - else: - # Use the same image for all prompts - inputs = [{ - "prompt": prompts[i % len(prompts)], - "multi_modal_data": { - modality: data - }, - } for i in range(args.num_prompts)] - - if args.time_generate: +@contextmanager +def time_counter(enable: bool): + if enable: import time start_time = time.time() - outputs = llm.generate(inputs, sampling_params=sampling_params) + yield elapsed_time = time.time() - start_time + print("-" * 50) print("-- generate time = {}".format(elapsed_time)) - + print("-" * 50) else: - outputs = llm.generate(inputs, sampling_params=sampling_params) - - for o in outputs: - generated_text = o.outputs[0].text - 
print(generated_text) + yield -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models for text generation') @@ -1028,6 +1187,86 @@ if __name__ == "__main__": action='store_true', help='If True, then use different prompt (with the same multi-modal ' 'data) for each request.') + return parser.parse_args() - args = parser.parse_args() + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + questions = mm_input["questions"] + + req_data = model_example_map[model](questions, modality) + + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {}) + + engine_args = asdict(req_data.engine_args) | { + "seed": args.seed, + "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache, + } + llm = LLM(**engine_args) + + # Don't want to check the flag multiple times, so just hijack `prompts`. + prompts = req_data.prompts if args.use_different_prompt_per_request else [ + req_data.prompts[0] + ] + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=req_data.stop_token_ids) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompts[0], + "multi_modal_data": { + modality: data + }, + } + else: + # Batch inference + if args.image_repeat_prob is not None: + # Repeat images with specified probability of "image_repeat_prob" + inputs = apply_image_repeat(args.image_repeat_prob, + args.num_prompts, data, prompts, + modality) + else: + # Use the same image for all prompts + inputs = [{ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: data + }, + } for i in range(args.num_prompts)] + + # Add LoRA request if applicable + lora_request = (req_data.lora_requests * + args.num_prompts if req_data.lora_requests else None) + + with time_counter(args.time_generate): + outputs = llm.generate( + inputs, + sampling_params=sampling_params, + lora_request=lora_request, + ) + + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + print("-" * 50) + + +if __name__ == "__main__": + args = parse_args() main(args) diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index a0b2b44b4e829..2637949551a1a 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -63,6 +63,7 @@ def run_e5_v(query: Query) -> ModelRequestData: model="royokong/e5-v", task="embed", max_model_len=4096, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -93,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData: task="embed", trust_remote_code=True, mm_processor_kwargs={"num_crops": 4}, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -131,6 +133,11 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): query = get_query(modality) req_data = 
model_example_map[model](query) + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {}) + engine_args = asdict(req_data.engine_args) | {"seed": seed} llm = LLM(**engine_args) @@ -143,12 +150,10 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): "multi_modal_data": mm_data, }) + print("-" * 50) for output in outputs: print(output.outputs.embedding) - - -def main(args: Namespace): - run_encode(args.model_name, args.modality, args.seed) + print("-" * 50) model_example_map = { @@ -156,7 +161,8 @@ model_example_map = { "vlm2vec": run_vlm2vec, } -if __name__ == "__main__": + +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models for multimodal embedding') @@ -175,6 +181,13 @@ if __name__ == "__main__": type=int, default=None, help="Set the seed when initializing `vllm.LLM`.") + return parser.parse_args() - args = parser.parse_args() + +def main(args: Namespace): + run_encode(args.model_name, args.modality, args.seed) + + +if __name__ == "__main__": + args = parse_args() main(args) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 98a739169d702..f165ea9efa10f 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -22,6 +22,16 @@ QUESTION = "What is the content of each image?" 
IMAGE_URLS = [ "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", + "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG", + "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg", + "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg", + "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg", + "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg", + "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg", ] @@ -61,6 +71,41 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "CohereForAI/aya-vision-8b" + + engine_args = EngineArgs( + model=model_name, + max_num_seqs=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [{ + "role": + "user", + "content": [ + *placeholders, 
+ { + "type": "text", + "text": question + }, + ], + }] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "deepseek-ai/deepseek-vl2-tiny" @@ -182,6 +227,33 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + # The configuration below has been confirmed to launch on a single L40 GPU. + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={ + "max_image_size": { + "longest_edge": 384 + }, + }, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|im_start|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -218,19 +290,115 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=131072, + tensor_parallel_size=8, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [{ + "role": + "user", + "content": [ + *placeholders, + { + 
"type": "text", + "text": question + }, + ], + }] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "moonshotai/Kimi-VL-A3B-Instruct" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=4, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [{ + "role": + "user", + "content": [ + *placeholders, + { + "type": "text", + "text": question + }, + ], + }] + + processor = AutoProcessor.from_pretrained(model_name, + trust_remote_code=True) + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" + + # Adjust this as necessary to fit in GPU + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + tensor_parallel_size=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "[IMG]" * len(image_urls) + prompt = f"[INST]{question}\n{placeholders}[/INST]" + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" # The configuration below has been confirmed to launch on a single L40 GPU. 
engine_args = EngineArgs( model=model_name, - max_model_len=4096, - max_num_seqs=16, + max_model_len=8192, + max_num_seqs=2, limit_mm_per_prompt={"image": len(image_urls)}, ) - placeholders = "<|image|>" * len(image_urls) - prompt = f"{placeholders}<|begin_of_text|>{question}" + img_prompt = "Given the first image <|image|> and the second image<|image|>" + prompt = f"<|begin_of_text|>{img_prompt}, {question}?" return ModelRequestData( engine_args=engine_args, prompt=prompt, @@ -335,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=10000, + max_model_len=4096, max_num_seqs=2, limit_mm_per_prompt={"image": len(image_urls)}, enable_lora=True, max_lora_rank=320, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={"dynamic_hd": 4}, ) placeholders = "".join(f"<|image_{i}|>" @@ -504,11 +674,15 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, + "aya_vision": load_aya_vision, "deepseek_vl_v2": load_deepseek_vl2, "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, "idefics3": load_idefics3, "internvl_chat": load_internvl, + "kimi_vl": load_kimi_vl, + "llama4": load_llama4, + "mistral3": load_mistral3, "mllama": load_mllama, "NVLM_D": load_nvlm_d, "phi3_v": load_phi3v, @@ -517,6 +691,7 @@ model_example_map = { "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, + "smolvlm": load_smolvlm, } @@ -527,15 +702,8 @@ def run_generate(model, question: str, image_urls: list[str], engine_args = asdict(req_data.engine_args) | {"seed": args.seed} llm = LLM(**engine_args) - # To maintain code compatibility in this script, we add LoRA here. - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) 
- if req_data.lora_requests: - for lora_request in req_data.lora_requests: - llm.llm_engine.add_lora(lora_request=lora_request) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=128, + max_tokens=256, stop_token_ids=req_data.stop_token_ids) outputs = llm.generate( @@ -545,29 +713,31 @@ def run_generate(model, question: str, image_urls: list[str], "image": req_data.image_data }, }, - sampling_params=sampling_params) + sampling_params=sampling_params, + lora_request=req_data.lora_requests, + ) + print("-" * 50) for o in outputs: generated_text = o.outputs[0].text print(generated_text) + print("-" * 50) def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]): req_data = model_example_map[model](question, image_urls) + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {}) + engine_args = asdict(req_data.engine_args) | {"seed": seed} llm = LLM(**engine_args) - # To maintain code compatibility in this script, we add LoRA here. - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) 
- if req_data.lora_requests: - for lora_request in req_data.lora_requests: - llm.llm_engine.add_lora(lora_request=lora_request) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=128, + max_tokens=256, stop_token_ids=req_data.stop_token_ids) outputs = llm.chat( [{ @@ -588,27 +758,17 @@ def run_chat(model: str, question: str, image_urls: list[str], }], sampling_params=sampling_params, chat_template=req_data.chat_template, + lora_request=req_data.lora_requests, ) + print("-" * 50) for o in outputs: generated_text = o.outputs[0].text print(generated_text) + print("-" * 50) -def main(args: Namespace): - model = args.model_type - method = args.method - seed = args.seed - - if method == "generate": - run_generate(model, QUESTION, IMAGE_URLS, seed) - elif method == "chat": - run_chat(model, QUESTION, IMAGE_URLS, seed) - else: - raise ValueError(f"Invalid method: {method}") - - -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models that support multi-image input for text ' @@ -628,6 +788,30 @@ if __name__ == "__main__": type=int, default=None, help="Set the seed when initializing `vllm.LLM`.") + parser.add_argument( + "--num-images", + "-n", + choices=list(range(1, 13)), # 12 is the max number of images + default=2, + help="Number of images to use for the demo.") + return parser.parse_args() - args = parser.parse_args() + +def main(args: Namespace): + model = args.model_type + method = args.method + seed = args.seed + + image_urls = IMAGE_URLS[:args.num_images] + + if method == "generate": + run_generate(model, QUESTION, image_urls, seed) + elif method == "chat": + run_chat(model, QUESTION, image_urls, seed) + else: + raise ValueError(f"Invalid method: {method}") + + +if __name__ == "__main__": + args = parse_args() main(args) diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index e2944896d1610..36079ff11d07e 
100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Example Python client for `vllm.entrypoints.api_server` +Start the demo server: + python -m vllm.entrypoints.api_server --model + NOTE: The API server is used only for demonstration and simple performance benchmarks. It is not intended for production use. For production use, we recommend `vllm serve` and the OpenAI client API. @@ -7,6 +10,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API. import argparse import json +from argparse import Namespace from collections.abc import Iterable import requests @@ -27,7 +31,6 @@ def post_http_request(prompt: str, pload = { "prompt": prompt, "n": n, - "use_beam_search": True, "temperature": 0.0, "max_tokens": 16, "stream": stream, @@ -55,14 +58,17 @@ def get_response(response: requests.Response) -> list[str]: return output -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--n", type=int, default=4) + parser.add_argument("--n", type=int, default=1) parser.add_argument("--prompt", type=str, default="San Francisco is a") parser.add_argument("--stream", action="store_true") - args = parser.parse_args() + return parser.parse_args() + + +def main(args: Namespace): prompt = args.prompt api_url = f"http://{args.host}:{args.port}/generate" n = args.n @@ -83,3 +89,8 @@ if __name__ == "__main__": output = get_response(response) for i, line in enumerate(output): print(f"Beam candidate {i}: {line!r}", flush=True) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py index fc434ada1d156..c2d4ef08ddbbe 100644 --- a/examples/online_serving/cohere_rerank_client.py +++ 
b/examples/online_serving/cohere_rerank_client.py @@ -2,32 +2,46 @@ """ Example of using the OpenAI entrypoint's rerank API which is compatible with the Cohere SDK: https://github.com/cohere-ai/cohere-python +Note that `pip install cohere` is needed to run this example. run: vllm serve BAAI/bge-reranker-base """ +from typing import Union + import cohere +from cohere import Client, ClientV2 -# cohere v1 client -co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key") -rerank_v1_result = co.rerank( - model="BAAI/bge-reranker-base", - query="What is the capital of France?", - documents=[ - "The capital of France is Paris", "Reranking is fun!", - "vLLM is an open-source framework for fast AI serving" - ]) +model = "BAAI/bge-reranker-base" -print(rerank_v1_result) +query = "What is the capital of France?" -# or the v2 -co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000") +documents = [ + "The capital of France is Paris", "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving" +] -v2_rerank_result = co2.rerank( - model="BAAI/bge-reranker-base", - query="What is the capital of France?", - documents=[ - "The capital of France is Paris", "Reranking is fun!", - "vLLM is an open-source framework for fast AI serving" - ]) -print(v2_rerank_result) +def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str, + documents: list[str]) -> dict: + return client.rerank(model=model, query=query, documents=documents) + + +def main(): + # cohere v1 client + cohere_v1 = cohere.Client(base_url="http://localhost:8000", + api_key="sk-fake-key") + rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents) + print("-" * 50) + print("rerank_v1_result:\n", rerank_v1_result) + print("-" * 50) + + # or the v2 + cohere_v2 = cohere.ClientV2("sk-fake-key", + base_url="http://localhost:8000") + rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents) + print("rerank_v2_result:\n", rerank_v2_result) + 
print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/disagg_examples/disagg_proxy_demo.py b/examples/online_serving/disagg_examples/disagg_proxy_demo.py new file mode 100644 index 0000000000000..a701636f357a8 --- /dev/null +++ b/examples/online_serving/disagg_examples/disagg_proxy_demo.py @@ -0,0 +1,450 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This file provides a disaggregated prefilling proxy demo to demonstrate an +example usage of XpYd disaggregated prefilling. +We can launch multiple vllm instances (2 for prefill and 2 for decode), and +launch this proxy demo through: + python3 examples/online_serving/disagg_examples/disagg_proxy_demo.py \ + --model $model_name \ + --prefill localhost:8100 localhost:8101 \ + --decode localhost:8200 localhost:8201 \ + --port 8000 + +Note: This demo will be removed once the PDController implemented in PR 15343 +(https://github.com/vllm-project/vllm/pull/15343) supports XpYd. +""" +import argparse +import ipaddress +import itertools +import json +import logging +import os +import sys +from abc import ABC, abstractmethod +from typing import Callable, Optional + +import aiohttp +import requests +import uvicorn +from fastapi import (APIRouter, Depends, FastAPI, Header, HTTPException, + Request, status) +from fastapi.responses import JSONResponse, StreamingResponse + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +logger = logging.getLogger() +logging.basicConfig(level=logging.INFO) + + +class SchedulingPolicy(ABC): + + @abstractmethod + def schedule(self, cycler: itertools.cycle): + raise NotImplementedError("Scheduling Proxy is not set.") + + +class Proxy: + + def __init__( + self, + prefill_instances: list[str], + decode_instances: list[str], + model: str, + scheduling_policy: SchedulingPolicy, + custom_create_completion: Optional[Callable[[Request], + StreamingResponse]] = None, + custom_create_chat_completion: Optional[Callable[ + [Request], StreamingResponse]] = None, 
+ ): + self.prefill_instances = prefill_instances + self.decode_instances = decode_instances + self.prefill_cycler = itertools.cycle(prefill_instances) + self.decode_cycler = itertools.cycle(decode_instances) + self.model = model + self.scheduling_policy = scheduling_policy + self.custom_create_completion = custom_create_completion + self.custom_create_chat_completion = custom_create_chat_completion + self.router = APIRouter() + self.setup_routes() + + def setup_routes(self): + self.router.post( + "/v1/completions", + dependencies=[ + Depends(self.validate_json_request) + ])(self.custom_create_completion if self. + custom_create_completion else self.create_completion) + self.router.post( + "/v1/chat/completions", + dependencies=[ + Depends(self.validate_json_request) + ])(self.custom_create_chat_completion if self. + custom_create_chat_completion else self.create_chat_completion) + self.router.get("/status", + response_class=JSONResponse)(self.get_status) + self.router.post("/instances/add", + dependencies=[Depends(self.api_key_authenticate) + ])(self.add_instance_endpoint) + + async def validate_json_request(self, raw_request: Request): + content_type = raw_request.headers.get("content-type", "").lower() + if content_type != "application/json": + raise HTTPException( + status_code=415, + detail= + "Unsupported Media Type: Only 'application/json' is allowed", + ) + + def api_key_authenticate(self, x_api_key: str = Header(...)): + expected_api_key = os.environ.get("ADMIN_API_KEY") + if not expected_api_key: + logger.error("ADMIN_API_KEY is not set in the environment.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Server configuration error.", + ) + if x_api_key != expected_api_key: + logger.warning("Unauthorized access attempt with API Key: %s", + x_api_key) + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Forbidden: Invalid API Key.", + ) + + async def validate_instance(self, instance: str) -> bool: 
+ url = f"http://{instance}/v1/models" + try: + async with aiohttp.ClientSession( + timeout=AIOHTTP_TIMEOUT) as client: + logger.info("Verifying %s ...", instance) + async with client.get(url) as response: + if response.status == 200: + data = await response.json() + if "data" in data and len(data["data"]) > 0: + model_cur = data["data"][0].get("id", "") + if model_cur == self.model: + logger.info("Instance: %s could be added.", + instance) + return True + else: + logger.warning("Mismatch model %s : %s != %s", + instance, model_cur, self.model) + return False + else: + return False + else: + return False + except aiohttp.ClientError as e: + logger.error(str(e)) + return False + except Exception as e: + logger.error(str(e)) + return False + + async def add_instance_endpoint(self, request: Request): + try: + data = await request.json() + logger.warning(str(data)) + instance_type = data.get("type") + instance = data.get("instance") + if instance_type not in ["prefill", "decode"]: + raise HTTPException(status_code=400, + detail="Invalid instance type.") + if not instance or ":" not in instance: + raise HTTPException(status_code=400, + detail="Invalid instance format.") + host, port_str = instance.split(":") + try: + if host != "localhost": + ipaddress.ip_address(host) + port = int(port_str) + if not (0 < port < 65536): + raise HTTPException(status_code=400, + detail="Invalid port number.") + except Exception as e: + raise HTTPException(status_code=400, + detail="Invalid instance address.") from e + + is_valid = await self.validate_instance(instance) + if not is_valid: + raise HTTPException(status_code=400, + detail="Instance validation failed.") + + if instance_type == "prefill": + if instance not in self.prefill_instances: + self.prefill_instances.append(instance) + self.prefill_cycler = itertools.cycle( + self.prefill_instances) + else: + raise HTTPException(status_code=400, + detail="Instance already exists.") + else: + if instance not in self.decode_instances: + 
self.decode_instances.append(instance) + self.decode_cycler = itertools.cycle(self.decode_instances) + else: + raise HTTPException(status_code=400, + detail="Instance already exists.") + + return JSONResponse(content={ + "message": + f"Added {instance} to {instance_type}_instances." + }) + except HTTPException as http_exc: + raise http_exc + except Exception as e: + logger.error("Error in add_instance_endpoint: %s", str(e)) + raise HTTPException(status_code=500, detail=str(e)) from e + + async def forward_request(self, url, data, use_chunked=True): + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + try: + async with session.post(url=url, json=data, + headers=headers) as response: + if 200 <= response.status < 300 or 400 <= response.status < 500: # noqa: E501 + if use_chunked: + async for chunk_bytes in response.content.iter_chunked( # noqa: E501 + 1024): + yield chunk_bytes + else: + content = await response.read() + yield content + else: + error_content = await response.text() + try: + error_content = json.loads(error_content) + except json.JSONDecodeError: + error_content = error_content + logger.error("Request failed with status %s: %s", + response.status, error_content) + raise HTTPException( + status_code=response.status, + detail= + f"Request failed with status {response.status}: " + f"{error_content}", + ) + except aiohttp.ClientError as e: + logger.error("ClientError occurred: %s", str(e)) + raise HTTPException( + status_code=502, + detail= + "Bad Gateway: Error communicating with upstream server.", + ) from e + except Exception as e: + logger.error("Unexpected error: %s", str(e)) + raise HTTPException(status_code=500, detail=str(e)) from e + + def schedule(self, cycler: itertools.cycle) -> str: + return self.scheduling_policy.schedule(cycler) + + async def get_status(self): + status = { + "prefill_node_count": len(self.prefill_instances), + 
"decode_node_count": len(self.decode_instances), + "prefill_nodes": self.prefill_instances, + "decode_nodes": self.decode_instances, + } + return status + + async def create_completion(self, raw_request: Request): + try: + request = await raw_request.json() + + kv_prepare_request = request.copy() + kv_prepare_request["max_tokens"] = 1 + + prefill_instance = self.schedule(self.prefill_cycler) + try: + async for _ in self.forward_request( + f"http://{prefill_instance}/v1/completions", + kv_prepare_request): + continue + except HTTPException as http_exc: + self.remove_instance_endpoint("prefill", prefill_instance) + raise http_exc + + # Perform kv recv and decoding stage + decode_instance = self.schedule(self.decode_cycler) + + try: + generator = self.forward_request( + f"http://{decode_instance}/v1/completions", request) + except HTTPException as http_exc: + self.remove_instance_endpoint("decode", decode_instance) + raise http_exc + response = StreamingResponse(generator) + return response + except Exception: + import sys + + exc_info = sys.exc_info() + print("Error occurred in disagg proxy server") + print(exc_info) + + async def create_chat_completion(self, raw_request: Request): + try: + request = await raw_request.json() + + # add params to request + kv_prepare_request = request.copy() + kv_prepare_request["max_tokens"] = 1 + + # prefill stage + prefill_instance = self.schedule(self.prefill_cycler) + try: + async for _ in self.forward_request( + f"http://{prefill_instance}/v1/chat/completions", + kv_prepare_request): + continue + except HTTPException as http_exc: + self.remove_instance_endpoint("prefill", prefill_instance) + raise http_exc + # Perform kv recv and decoding stage + decode_instance = self.schedule(self.decode_cycler) + + try: + generator = self.forward_request( + "http://" + decode_instance + "/v1/chat/completions", + request) + except HTTPException as http_exc: + self.remove_instance_endpoint("decode", decode_instance) + raise http_exc + response = 
StreamingResponse(content=generator)
+            return response
+        except Exception:
+            exc_info = sys.exc_info()
+            error_messages = [str(e) for e in exc_info if e]
+            print("Error occurred in disagg proxy server")
+            print(error_messages)
+            return StreamingResponse(content=iter(error_messages),
+                                     media_type="text/event-stream")
+
+    def remove_instance_endpoint(self, instance_type, instance):
+        if (instance_type == "decode" and instance in self.decode_instances):
+            self.decode_instances.remove(instance)
+            self.decode_cycler = itertools.cycle(self.decode_instances)
+        if (instance_type == "prefill"
+                and instance in self.prefill_instances):
+            self.prefill_instances.remove(instance)
+            self.prefill_cycler = itertools.cycle(self.prefill_instances)
+
+
+class RoundRobinSchedulingPolicy(SchedulingPolicy):
+
+    def __init__(self):
+        super().__init__()
+
+    def schedule(self, cycler: itertools.cycle) -> str:
+        return next(cycler)
+
+
+class ProxyServer:
+
+    def __init__(
+        self,
+        args: argparse.Namespace,
+        scheduling_policy: Optional[SchedulingPolicy] = None,
+        create_completion: Optional[Callable[[Request],
+                                             StreamingResponse]] = None,
+        create_chat_completion: Optional[Callable[[Request],
+                                                  StreamingResponse]] = None,
+    ):
+        self.validate_parsed_serve_args(args)
+        self.port = args.port
+        self.proxy_instance = Proxy(
+            prefill_instances=[] if args.prefill is None else args.prefill,
+            decode_instances=[] if args.decode is None else args.decode,
+            model=args.model,
+            scheduling_policy=(scheduling_policy if scheduling_policy
+                               is not None else RoundRobinSchedulingPolicy()),
+            custom_create_completion=create_completion,
+            custom_create_chat_completion=create_chat_completion,
+        )
+
+    def validate_parsed_serve_args(self, args: argparse.Namespace):
+        if not args.prefill:
+            raise ValueError("Please specify at least one prefill node.")
+        if not args.decode:
+            raise ValueError("Please specify at least one decode node.")
+        self.validate_instances(args.prefill)
+        self.validate_instances(args.decode)
+ self.verify_model_config(args.prefill, args.model) + self.verify_model_config(args.decode, args.model) + + def validate_instances(self, instances: list): + for instance in instances: + if len(instance.split(":")) != 2: + raise ValueError(f"Invalid instance format: {instance}") + host, port = instance.split(":") + try: + if host != "localhost": + ipaddress.ip_address(host) + port = int(port) + if not (0 < port < 65536): + raise ValueError( + f"Invalid port number in instance: {instance}") + except Exception as e: + raise ValueError( + f"Invalid instance {instance}: {str(e)}") from e + + def verify_model_config(self, instances: list, model: str) -> None: + model_suffix = model.split("/")[-1] + for instance in instances: + try: + response = requests.get(f"http://{instance}/v1/models") + if response.status_code == 200: + model_cur = response.json()["data"][0]["id"] + model_cur_suffix = model_cur.split("/")[-1] + if model_cur_suffix != model_suffix: + raise ValueError( + f"{instance} serves a different model: " + f"{model_cur} != {model}") + else: + raise ValueError(f"Cannot get model id from {instance}!") + except requests.RequestException as e: + raise ValueError( + f"Error communicating with {instance}: {str(e)}") from e + + def run_server(self): + app = FastAPI() + app.include_router(self.proxy_instance.router) + config = uvicorn.Config(app, port=self.port, loop="uvloop") + server = uvicorn.Server(config) + server.run() + + +if __name__ == "__main__": + # Todo: allow more config + parser = argparse.ArgumentParser("vLLM disaggregated proxy server.") + parser.add_argument("--model", + "-m", + type=str, + required=True, + help="Model name") + + parser.add_argument( + "--prefill", + "-p", + type=str, + nargs="+", + help="List of prefill node URLs (host:port)", + ) + + parser.add_argument( + "--decode", + "-d", + type=str, + nargs="+", + help="List of decode node URLs (host:port)", + ) + + parser.add_argument( + "--port", + type=int, + default=8000, + help="Server port 
number", + ) + args = parser.parse_args() + proxy_server = ProxyServer(args=args) + proxy_server.run_server() diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py index ee01e1eae6281..314f1c5b73951 100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/online_serving/gradio_openai_chatbot_webserver.py @@ -1,52 +1,32 @@ # SPDX-License-Identifier: Apache-2.0 +"""Example for starting a Gradio OpenAI Chatbot Webserver +Start vLLM API server: + vllm serve meta-llama/Llama-2-7b-chat-hf +Start Gradio OpenAI Chatbot Webserver: + python examples/online_serving/gradio_openai_chatbot_webserver.py \ + -m meta-llama/Llama-2-7b-chat-hf + +Note that `pip install --upgrade gradio` is needed to run this example. +More details: https://github.com/gradio-app/gradio + +If your antivirus software blocks the download of frpc for gradio, +you can install it manually by following these steps: + +1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64 +2. Rename the downloaded file to: frpc_linux_amd64_v0.3 +3. 
Move the file to this location: /home/user/.cache/huggingface/gradio/frpc +""" import argparse import gradio as gr from openai import OpenAI -# Argument parser setup -parser = argparse.ArgumentParser( - description='Chatbot Interface with Customizable Parameters') -parser.add_argument('--model-url', - type=str, - default='http://localhost:8000/v1', - help='Model URL') -parser.add_argument('-m', - '--model', - type=str, - required=True, - help='Model name for the chatbot') -parser.add_argument('--temp', - type=float, - default=0.8, - help='Temperature for text generation') -parser.add_argument('--stop-token-ids', - type=str, - default='', - help='Comma-separated stop token IDs') -parser.add_argument("--host", type=str, default=None) -parser.add_argument("--port", type=int, default=8001) -# Parse the arguments -args = parser.parse_args() - -# Set OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = args.model_url - -# Create an OpenAI client to interact with the API server -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - - -def predict(message, history): - # Convert chat history to OpenAI format +def format_history_to_openai(history): history_openai_format = [{ "role": "system", - "content": "You are a great ai assistant." + "content": "You are a great AI assistant." 
}] for human, assistant in history: history_openai_format.append({"role": "user", "content": human}) @@ -54,31 +34,92 @@ def predict(message, history): "role": "assistant", "content": assistant }) + return history_openai_format + + +def predict(message, history, client, model_name, temp, stop_token_ids): + # Format history to OpenAI chat format + history_openai_format = format_history_to_openai(history) history_openai_format.append({"role": "user", "content": message}) - # Create a chat completion request and send it to the API server + # Send request to OpenAI API (vLLM server) stream = client.chat.completions.create( - model=args.model, # Model name to use - messages=history_openai_format, # Chat history - temperature=args.temp, # Temperature for text generation - stream=True, # Stream response + model=model_name, + messages=history_openai_format, + temperature=temp, + stream=True, extra_body={ 'repetition_penalty': 1, - 'stop_token_ids': [ - int(id.strip()) for id in args.stop_token_ids.split(',') - if id.strip() - ] if args.stop_token_ids else [] + 'stop_token_ids': + [int(id.strip()) + for id in stop_token_ids.split(',')] if stop_token_ids else [] }) - # Read and return generated text from response stream - partial_message = "" + # Collect all chunks and concatenate them into a full message + full_message = "" for chunk in stream: - partial_message += (chunk.choices[0].delta.content or "") - yield partial_message + full_message += (chunk.choices[0].delta.content or "") + + # Return the full message as a single response + return full_message -# Create and launch a chat interface with Gradio -gr.ChatInterface(predict).queue().launch(server_name=args.host, - server_port=args.port, - share=True) +def parse_args(): + parser = argparse.ArgumentParser( + description='Chatbot Interface with Customizable Parameters') + parser.add_argument('--model-url', + type=str, + default='http://localhost:8000/v1', + help='Model URL') + parser.add_argument('-m', + '--model', + 
type=str, + required=True, + help='Model name for the chatbot') + parser.add_argument('--temp', + type=float, + default=0.8, + help='Temperature for text generation') + parser.add_argument('--stop-token-ids', + type=str, + default='', + help='Comma-separated stop token IDs') + parser.add_argument("--host", type=str, default=None) + parser.add_argument("--port", type=int, default=8001) + return parser.parse_args() + + +def build_gradio_interface(client, model_name, temp, stop_token_ids): + + def chat_predict(message, history): + return predict(message, history, client, model_name, temp, + stop_token_ids) + + return gr.ChatInterface(fn=chat_predict, + title="Chatbot Interface", + description="A simple chatbot powered by vLLM") + + +def main(): + # Parse the arguments + args = parse_args() + + # Set OpenAI's API key and API base to use vLLM's API server + openai_api_key = "EMPTY" + openai_api_base = args.model_url + + # Create an OpenAI client + client = OpenAI(api_key=openai_api_key, base_url=openai_api_base) + + # Define the Gradio chatbot interface using the predict function + gradio_interface = build_gradio_interface(client, args.model, args.temp, + args.stop_token_ids) + + gradio_interface.queue().launch(server_name=args.host, + server_port=args.port, + share=True) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py index 85a9119c6aa2f..2e7c2a0c5838c 100644 --- a/examples/online_serving/gradio_webserver.py +++ b/examples/online_serving/gradio_webserver.py @@ -1,5 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 +"""Example for starting a Gradio Webserver +Start vLLM API server: + python -m vllm.entrypoints.api_server \ + --model meta-llama/Llama-2-7b-chat-hf +Start Webserver: + python examples/online_serving/gradio_webserver.py + +Note that `pip install --upgrade gradio` is needed to run this example. 
+More details: https://github.com/gradio-app/gradio + +If your antivirus software blocks the download of frpc for gradio, +you can install it manually by following these steps: + +1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64 +2. Rename the downloaded file to: frpc_linux_amd64_v0.3 +3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc +""" import argparse import json @@ -39,16 +56,23 @@ def build_demo(): return demo -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default=None) parser.add_argument("--port", type=int, default=8001) parser.add_argument("--model-url", type=str, default="http://localhost:8000/generate") - args = parser.parse_args() + return parser.parse_args() + +def main(args): demo = build_demo() demo.queue().launch(server_name=args.host, server_port=args.port, share=True) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py index 3e760e1717883..3076bba765ce5 100644 --- a/examples/online_serving/jinaai_rerank_client.py +++ b/examples/online_serving/jinaai_rerank_client.py @@ -23,12 +23,19 @@ data = { "The capital of France is Paris.", "Horses and cows are both animals" ] } -response = requests.post(url, headers=headers, json=data) -# Check the response -if response.status_code == 200: - print("Request successful!") - print(json.dumps(response.json(), indent=2)) -else: - print(f"Request failed with status code: {response.status_code}") - print(response.text) + +def main(): + response = requests.post(url, headers=headers, json=data) + + # Check the response + if response.status_code == 200: + print("Request successful!") + print(json.dumps(response.json(), indent=2)) + else: + print(f"Request failed with status code: {response.status_code}") + print(response.text) + + +if __name__ == 
"__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index a815620411309..74e0c045d6214 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -1,38 +1,49 @@ # SPDX-License-Identifier: Apache-2.0 - +"""Example Python client for OpenAI Chat Completion using vLLM API server +NOTE: start a supported chat completion model server with `vllm serve`, e.g. + vllm serve meta-llama/Llama-2-7b-chat-hf +""" from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) +messages = [{ + "role": "system", + "content": "You are a helpful assistant." +}, { + "role": "user", + "content": "Who won the world series in 2020?" +}, { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020." +}, { + "role": "user", + "content": "Where was it played?" +}] -models = client.models.list() -model = models.data[0].id -chat_completion = client.chat.completions.create( - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "Who won the world series in 2020?" - }, { - "role": - "assistant", - "content": - "The Los Angeles Dodgers won the World Series in 2020." - }, { - "role": "user", - "content": "Where was it played?" 
- }], - model=model, -) +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) -print("Chat completion results:") -print(chat_completion) + models = client.models.list() + model = models.data[0].id + + chat_completion = client.chat.completions.create( + messages=messages, + model=model, + ) + + print("-" * 50) + print("Chat completion results:") + print(chat_completion) + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index ecfcf05a90d16..70db4d95e6494 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -9,7 +9,7 @@ vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja (multi-image inference with Phi-3.5-vision-instruct) vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' (audio inference with Ultravox) vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096 @@ -303,12 +303,7 @@ example_function_map = { } -def main(args) -> None: - chat_type = args.chat_type - example_function_map[chat_type]() - - -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using OpenAI client for online serving with ' 'multimodal language models served with vLLM.') @@ -318,5 +313,14 @@ if __name__ == "__main__": default="single-image", choices=list(example_function_map.keys()), help='Conversation type with multimodal data.') - args = parser.parse_args() + return parser.parse_args() + + +def main(args) -> None: + chat_type = args.chat_type + example_function_map[chat_type]() + + +if __name__ 
== "__main__": + args = parse_args() main(args) diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index 416fb61ca8bb5..c25203860ff39 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -17,6 +17,7 @@ vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \ --enable-auto-tool-choice --tool-call-parser hermes """ import json +from typing import Any from openai import OpenAI @@ -24,15 +25,6 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - tools = [{ "type": "function", "function": { @@ -78,86 +70,123 @@ messages = [{ "Can you tell me what the temperate will be in Dallas, in fahrenheit?" 
}] -chat_completion = client.chat.completions.create(messages=messages, - model=model, - tools=tools) -print("Chat completion results:") -print(chat_completion) -print("\n\n") - -tool_calls_stream = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=True) - -chunks = [] -for chunk in tool_calls_stream: - chunks.append(chunk) - if chunk.choices[0].delta.tool_calls: - print(chunk.choices[0].delta.tool_calls[0]) - else: - print(chunk.choices[0].delta) - -arguments = [] -tool_call_idx = -1 -for chunk in chunks: - - if chunk.choices[0].delta.tool_calls: - tool_call = chunk.choices[0].delta.tool_calls[0] - - if tool_call.index != tool_call_idx: - if tool_call_idx >= 0: - print( - f"streamed tool call arguments: {arguments[tool_call_idx]}" - ) - tool_call_idx = chunk.choices[0].delta.tool_calls[0].index - arguments.append("") - if tool_call.id: - print(f"streamed tool call id: {tool_call.id} ") - - if tool_call.function: - if tool_call.function.name: - print(f"streamed tool call name: {tool_call.function.name}") - - if tool_call.function.arguments: - arguments[tool_call_idx] += tool_call.function.arguments - -if len(arguments): - print(f"streamed tool call arguments: {arguments[-1]}") - -print("\n\n") - -messages.append({ - "role": "assistant", - "tool_calls": chat_completion.choices[0].message.tool_calls -}) - - -# Now, simulate a tool call def get_current_weather(city: str, state: str, unit: 'str'): return ("The weather in Dallas, Texas is 85 degrees fahrenheit. 
It is " "partly cloudly, with highs in the 90's.") -available_tools = {"get_current_weather": get_current_weather} +def handle_tool_calls_stream( + client: OpenAI, + messages: list[dict[str, str]], + model: str, + tools: list[dict[str, Any]], +) -> list[Any]: + tool_calls_stream = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=True) + chunks = [] + print("chunks: ") + for chunk in tool_calls_stream: + chunks.append(chunk) + if chunk.choices[0].delta.tool_calls: + print(chunk.choices[0].delta.tool_calls[0]) + else: + print(chunk.choices[0].delta) + return chunks -completion_tool_calls = chat_completion.choices[0].message.tool_calls -for call in completion_tool_calls: - tool_to_call = available_tools[call.function.name] - args = json.loads(call.function.arguments) - result = tool_to_call(**args) - print(result) + +def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]: + arguments = [] + tool_call_idx = -1 + print("arguments: ") + for chunk in chunks: + if chunk.choices[0].delta.tool_calls: + tool_call = chunk.choices[0].delta.tool_calls[0] + if tool_call.index != tool_call_idx: + if tool_call_idx >= 0: + print(f"streamed tool call arguments: " + f"{arguments[tool_call_idx]}") + tool_call_idx = chunk.choices[0].delta.tool_calls[0].index + arguments.append("") + if tool_call.id: + print(f"streamed tool call id: {tool_call.id} ") + + if tool_call.function: + if tool_call.function.name: + print( + f"streamed tool call name: {tool_call.function.name}") + + if tool_call.function.arguments: + arguments[tool_call_idx] += tool_call.function.arguments + + return arguments + + +def main(): + # Initialize OpenAI client + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # Get available models and select one + models = client.models.list() + model = models.data[0].id + + chat_completion = client.chat.completions.create(messages=messages, + model=model, 
+ tools=tools) + + print("-" * 70) + print("Chat completion results:") + print(chat_completion) + print("-" * 70) + + # Stream tool calls + chunks = handle_tool_calls_stream(client, messages, model, tools) + print("-" * 70) + + # Handle arguments from streamed tool calls + arguments = handle_tool_calls_arguments(chunks) + + if len(arguments): + print(f"streamed tool call arguments: {arguments[-1]}\n") + + print("-" * 70) + + # Add tool call results to the conversation messages.append({ - "role": "tool", - "content": result, - "tool_call_id": call.id, - "name": call.function.name + "role": "assistant", + "tool_calls": chat_completion.choices[0].message.tool_calls }) -chat_completion_2 = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=False) -print("\n\n") -print(chat_completion_2) + # Now, simulate a tool call + available_tools = {"get_current_weather": get_current_weather} + + completion_tool_calls = chat_completion.choices[0].message.tool_calls + for call in completion_tool_calls: + tool_to_call = available_tools[call.function.name] + args = json.loads(call.function.arguments) + result = tool_to_call(**args) + print("tool_to_call result: ", result) + messages.append({ + "role": "tool", + "content": result, + "tool_call_id": call.id, + "name": call.function.name + }) + + chat_completion_2 = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=False) + print("Chat completion2 results:") + print(chat_completion_2) + print("-" * 70) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py new file mode 100644 index 0000000000000..97d900bb75f1a --- /dev/null +++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +To run this example, you can start the 
vLLM server +without any specific flags: + +```bash +VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \ + --guided-decoding-backend outlines +``` + +This example demonstrates how to generate chat completions +using the OpenAI Python client library. +""" + +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": + "string", + "description": + "The city to find the weather for" + ", e.g. 'San Francisco'", + }, + "state": { + "type": + "string", + "description": + "the two-letter abbreviation for the state that the " + "city is in, e.g. 'CA' which would mean 'California'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["city", "state", "unit"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_forecast", + "description": "Get the weather forecast for a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": + "string", + "description": + "The city to get the forecast for, e.g. 'New York'", + }, + "state": { + "type": + "string", + "description": + "The two-letter abbreviation for the state, e.g. 'NY'", + }, + "days": { + "type": + "integer", + "description": + "Number of days to get the forecast for (1-7)", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["city", "state", "days", "unit"], + }, + }, + }, +] + +messages = [ + { + "role": "user", + "content": "Hi! How are you doing today?" + }, + { + "role": "assistant", + "content": "I'm doing well! 
How can I help you?" + }, + { + "role": + "user", + "content": + "Can you tell me what the current weather is in Dallas \ + and the forecast for the next 5 days, in fahrenheit?", + }, +] + + +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + chat_completion = client.chat.completions.create( + messages=messages, + model=model, + tools=tools, + tool_choice="required", + stream=True # Enable streaming response + ) + + for chunk in chat_completion: + if chunk.choices and chunk.choices[0].delta.tool_calls: + print(chunk.choices[0].delta.tool_calls) + + chat_completion = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + tool_choice="required") + + print(chat_completion.choices[0].message.tool_calls) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py index 9e7a69c6c87d6..8c6470aa3dd41 100644 --- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py @@ -31,14 +31,6 @@ available_tools = {"get_current_weather": get_current_weather} openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - tools = [{ "type": "function", "function": { @@ -109,69 +101,87 @@ def extract_reasoning_and_calls(chunks: list): return reasoning_content, arguments, function_names -print("---------Full Generate With Automatic Function Calling-------------") -tool_calls = client.chat.completions.create(messages=messages, - model=model, - tools=tools) -print(f"reasoning_content: 
{tool_calls.choices[0].message.reasoning_content}") -print(f"function name: " - f"{tool_calls.choices[0].message.tool_calls[0].function.name}") -print(f"function arguments: " - f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}") +def main(): + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) -print("----------Stream Generate With Automatic Function Calling-----------") -tool_calls_stream = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=True) -chunks = [] -for chunk in tool_calls_stream: - chunks.append(chunk) + models = client.models.list() + model = models.data[0].id -reasoning_content, arguments, function_names = extract_reasoning_and_calls( - chunks) + print( + "---------Full Generate With Automatic Function Calling-------------") + tool_calls = client.chat.completions.create(messages=messages, + model=model, + tools=tools) + print( + f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}" + ) + print(f"function name: " + f"{tool_calls.choices[0].message.tool_calls[0].function.name}") + print(f"function arguments: " + f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}") -print(f"reasoning_content: {reasoning_content}") -print(f"function name: {function_names[0]}") -print(f"function arguments: {arguments[0]}") + print( + "----------Stream Generate With Automatic Function Calling-----------") + tool_calls_stream = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=True) -print("----------Full Generate With Named Function Calling-----------------") -tool_calls = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - tool_choice={ - "type": "function", - "function": { - "name": - "get_current_weather" - } - }) + chunks = list(tool_calls_stream) -tool_call = tool_calls.choices[0].message.tool_calls[0].function -print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}") 
-print(f"function name: {tool_call.name}") -print(f"function arguments: {tool_call.arguments}") -print("----------Stream Generate With Named Function Calling--------------") + reasoning_content, arguments, function_names = extract_reasoning_and_calls( + chunks) -tool_calls_stream = client.chat.completions.create( - messages=messages, - model=model, - tools=tools, - tool_choice={ - "type": "function", - "function": { - "name": "get_current_weather" - } - }, - stream=True) + print(f"reasoning_content: {reasoning_content}") + print(f"function name: {function_names[0]}") + print(f"function arguments: {arguments[0]}") -chunks = [] -for chunk in tool_calls_stream: - chunks.append(chunk) + print( + "----------Full Generate With Named Function Calling-----------------") + tool_calls = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + tool_choice={ + "type": "function", + "function": { + "name": + "get_current_weather" + } + }) -reasoning_content, arguments, function_names = extract_reasoning_and_calls( - chunks) -print(f"reasoning_content: {reasoning_content}") -print(f"function name: {function_names[0]}") -print(f"function arguments: {arguments[0]}") -print("\n\n") + tool_call = tool_calls.choices[0].message.tool_calls[0].function + print( + f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}" + ) + print(f"function name: {tool_call.name}") + print(f"function arguments: {tool_call.arguments}") + print( + "----------Stream Generate With Named Function Calling--------------") + + tool_calls_stream = client.chat.completions.create( + messages=messages, + model=model, + tools=tools, + tool_choice={ + "type": "function", + "function": { + "name": "get_current_weather" + } + }, + stream=True) + + chunks = list(tool_calls_stream) + + reasoning_content, arguments, function_names = extract_reasoning_and_calls( + chunks) + print(f"reasoning_content: {reasoning_content}") + print(f"function name: {function_names[0]}") + 
print(f"function arguments: {arguments[0]}") + print("\n\n") + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index b5dbed1205d35..6f5f7b5fa20ba 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -3,8 +3,8 @@ An example shows how to generate chat completions from reasoning models like DeepSeekR1. -To run this example, you need to start the vLLM server with the reasoning -parser: +To run this example, you need to start the vLLM server +with the reasoning parser: ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ @@ -21,34 +21,44 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) -models = client.models.list() -model = models.data[0].id +def main(): + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) -# Round 1 -messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] -response = client.chat.completions.create(model=model, messages=messages) + models = client.models.list() + model = models.data[0].id -reasoning_content = response.choices[0].message.reasoning_content -content = response.choices[0].message.content + # Round 1 + messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] + # ruff: noqa: E501 + # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` + response = client.chat.completions.create(model=model, messages=messages) -print("reasoning_content for Round 1:", reasoning_content) -print("content for Round 1:", content) + reasoning_content = response.choices[0].message.reasoning_content + content = response.choices[0].message.content -# Round 2 -messages.append({"role": "assistant", "content": content}) 
-messages.append({ - "role": "user", - "content": "How many Rs are there in the word 'strawberry'?", -}) -response = client.chat.completions.create(model=model, messages=messages) + print("reasoning_content for Round 1:", reasoning_content) + print("content for Round 1:", content) -reasoning_content = response.choices[0].message.reasoning_content -content = response.choices[0].message.content + # Round 2 + messages.append({"role": "assistant", "content": content}) + messages.append({ + "role": + "user", + "content": + "How many Rs are there in the word 'strawberry'?", + }) + response = client.chat.completions.create(model=model, messages=messages) -print("reasoning_content for Round 2:", reasoning_content) -print("content for Round 2:", content) + reasoning_content = response.choices[0].message.reasoning_content + content = response.choices[0].message.content + + print("reasoning_content for Round 2:", reasoning_content) + print("content for Round 2:", content) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index fe4332576d438..90481cdc0fb79 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -3,7 +3,7 @@ An example shows how to generate chat completions from reasoning models like DeepSeekR1. 
-To run this example, you need to start the vLLM server with the reasoning +To run this example, you need to start the vLLM server with the reasoning parser: ```bash @@ -29,40 +29,49 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] -stream = client.chat.completions.create(model=model, - messages=messages, - stream=True) -print("client: Start streaming chat completions...") -printed_reasoning_content = False -printed_content = False -for chunk in stream: - reasoning_content = None - content = None - # Check the content is reasoning_content or content - if hasattr(chunk.choices[0].delta, "reasoning_content"): - reasoning_content = chunk.choices[0].delta.reasoning_content - elif hasattr(chunk.choices[0].delta, "content"): - content = chunk.choices[0].delta.content +def main(): + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) - if reasoning_content is not None: - if not printed_reasoning_content: - printed_reasoning_content = True - print("reasoning_content:", end="", flush=True) - print(reasoning_content, end="", flush=True) - elif content is not None: - if not printed_content: - printed_content = True - print("\ncontent:", end="", flush=True) - # Extract and print the content - print(content, end="", flush=True) + models = client.models.list() + model = models.data[0].id + + # ruff: noqa: E501 + # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}` + stream = client.chat.completions.create(model=model, + messages=messages, + stream=True) + + print("client: Start streaming chat completions...") + printed_reasoning_content = False + printed_content = False + + for chunk in stream: + reasoning_content = None + content = None + # Check the content is 
reasoning_content or content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + reasoning_content = chunk.choices[0].delta.reasoning_content + elif hasattr(chunk.choices[0].delta, "content"): + content = chunk.choices[0].delta.content + + if reasoning_content is not None: + if not printed_reasoning_content: + printed_reasoning_content = True + print("reasoning_content:", end="", flush=True) + print(reasoning_content, end="", flush=True) + elif content is not None: + if not printed_content: + printed_content = True + print("\ncontent:", end="", flush=True) + # Extract and print the content + print(content, end="", flush=True) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py index 2c63c5ec370e3..c850b5aa2f800 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py @@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict): print("Embedding output:", response_json["data"][0]["embedding"]) -if __name__ == '__main__': +def parse_args(): parser = argparse.ArgumentParser( "Script to call a specified VLM through the API. 
Make sure to serve " "the model with --task embed before running this.") @@ -107,8 +107,10 @@ if __name__ == '__main__': choices=["vlm2vec", "dse_qwen2_vl"], required=True, help="Which model to call.") - args = parser.parse_args() + return parser.parse_args() + +def main(args): if args.model == "vlm2vec": vlm2vec() elif args.model == "dse_qwen2_vl": @@ -120,3 +122,8 @@ if __name__ == '__main__': "type": "text", "content": "What is the weather like today?", }) + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 06b93d7d19315..6ab7619bff192 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -6,28 +6,36 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) -models = client.models.list() -model = models.data[0].id +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) -# Completion API -stream = False -completion = client.completions.create( - model=model, - prompt="A robot may not injure a human being", - echo=False, - n=2, - stream=stream, - logprobs=3) + models = client.models.list() + model = models.data[0].id -print("Completion results:") -if stream: - for c in completion: - print(c) -else: - print(completion) + # Completion API + stream = False + completion = client.completions.create( + model=model, + prompt="A robot may not injure a human being", + echo=False, + n=2, + stream=stream, + logprobs=3) + + print("-" * 50) + print("Completion results:") + if stream: + for c in completion: + print(c) + else: + print(completion) + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git 
a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py index 67c5fc91bc65b..20a64ddb21413 100644 --- a/examples/online_serving/openai_cross_encoder_score.py +++ b/examples/online_serving/openai_cross_encoder_score.py @@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: return response -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3") + return parser.parse_args() - args = parser.parse_args() + +def main(args): api_url = f"http://{args.host}:{args.port}/score" model_name = args.model @@ -30,9 +32,9 @@ if __name__ == "__main__": text_2 = "The capital of Brazil is Brasilia." prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 and text_2 are both strings:") + print("\nPrompt when text_1 and text_2 are both strings:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) text_1 = "What is the capital of France?" 
@@ -41,9 +43,9 @@ if __name__ == "__main__": ] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 is string and text_2 is a list:") + print("\nPrompt when text_1 is string and text_2 is a list:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) text_1 = [ @@ -54,7 +56,12 @@ if __name__ == "__main__": ] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 and text_2 are both lists:") + print("\nPrompt when text_1 and text_2 are both lists:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index b7c5651e3bab2..bc217f7ca7a0b 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -6,22 +6,29 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) -models = client.models.list() -model = models.data[0].id +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) -responses = client.embeddings.create( - input=[ - "Hello my name is", - "The best thing about vLLM is that it supports many different models" - ], - model=model, -) + models = client.models.list() + model = models.data[0].id -for data in responses.data: - print(data.embedding) # List of float of len 4096 + responses = client.embeddings.create( + # ruff: noqa: E501 + input=[ 
+ "Hello my name is", + "The best thing about vLLM is that it supports many different models" + ], + model=model, + ) + + for data in responses.data: + print(data.embedding) # List of float of len 4096 + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/openai_embedding_matryoshka_fy.py new file mode 100644 index 0000000000000..27ab8cb64037b --- /dev/null +++ b/examples/online_serving/openai_embedding_matryoshka_fy.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Example Python client for embedding API dimensions using vLLM API server +NOTE: + start a supported Matryoshka Embeddings model server with `vllm serve`, e.g. + vllm serve jinaai/jina-embeddings-v3 --trust-remote-code +""" + +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + + +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + responses = client.embeddings.create( + input=["Follow the white rabbit."], + model=model, + dimensions=1, + ) + + for data in responses.data: + print(data.embedding) # List of float of len 1 + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py index e17f9c5efd659..abcfe27c27699 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/openai_pooling_client.py @@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: return response -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) @@ -25,15 +25,20 @@ if 
__name__ == "__main__": type=str, default="jason9693/Qwen2.5-1.5B-apeach") - args = parser.parse_args() + return parser.parse_args() + + +def main(args): api_url = f"http://{args.host}:{args.port}/pooling" model_name = args.model # Input like Completions API prompt = {"model": model_name, "input": "vLLM is great!"} pooling_response = post_http_request(prompt=prompt, api_url=api_url) + print("-" * 50) print("Pooling Response:") pprint.pprint(pooling_response.json()) + print("-" * 50) # Input like Chat API prompt = { @@ -50,3 +55,9 @@ if __name__ == "__main__": pooling_response = post_http_request(prompt=prompt, api_url=api_url) print("Pooling Response:") pprint.pprint(pooling_response.json()) + print("-" * 50) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index 494e7c8ebe12a..5fcb7c5264162 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -23,10 +23,15 @@ def sync_openai(): with open(str(mary_had_lamb), "rb") as f: transcription = client.audio.transcriptions.create( file=f, - model="openai/whisper-small", + model="openai/whisper-large-v3", language="en", response_format="json", - temperature=0.0) + temperature=0.0, + # Additional sampling params not provided by OpenAI API. 
+ extra_body=dict( + seed=4419, + repetition_penalty=1.3, + )) print("transcription result:", transcription.text) diff --git a/examples/template_florence2.jinja b/examples/template_florence2.jinja new file mode 100644 index 0000000000000..d257aed6a85b0 --- /dev/null +++ b/examples/template_florence2.jinja @@ -0,0 +1,7 @@ +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- message['content'] -}} + {%- elif message['role'] == 'assistant' -%} + {{- message['content'] -}} + {%- endif -%} +{%- endfor -%} diff --git a/examples/tool_chat_template_llama3.2_pythonic.jinja b/examples/tool_chat_template_llama3.2_pythonic.jinja index 8c38de6c6a907..e4ec2353b3509 100644 --- a/examples/tool_chat_template_llama3.2_pythonic.jinja +++ b/examples/tool_chat_template_llama3.2_pythonic.jinja @@ -76,7 +76,7 @@ {{- tool_call.name + '(' -}} {%- for param in tool_call.arguments %} {{- param + '=' -}} - {{- "%sr" | format(tool_call.arguments[param]) -}} + {{- "%s" | format(tool_call.arguments[param]) -}} {% if not loop.last %}, {% endif %} {%- endfor %} {{- ')' -}} diff --git a/examples/tool_chat_template_llama4_pythonic.jinja b/examples/tool_chat_template_llama4_pythonic.jinja new file mode 100644 index 0000000000000..bd18a35bdda93 --- /dev/null +++ b/examples/tool_chat_template_llama4_pythonic.jinja @@ -0,0 +1,139 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = false %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. 
#} +{%- if messages[0]['role'] == 'system' %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content']|trim %} + {%- else %} + {%- set system_message = messages[0]['content'][0]['text']|trim %} + {%- endif %} + {%- set messages = messages[1:] %} +{%- else %} + {%- if tools is not none %} + {#- Add default tool system message when tools are provided #} + {%- set system_message = "You are a helpful assistant with tool calling " + "capabilities. Only reply with a tool call if the function exists in the " + "library provided by the user. If it doesn't exist, just reply directly in " + "natural language. When you receive a tool call response, use the output to " + "format an answer to the original user question." %} + {%- else %} + {%- set system_message = "" %} + {%- endif %} +{%- endif %} + +{#- System message if the user supplied one, or if tools are used (default tool system message) #} +{%- if system_message %} + {#- always use user provided system message to override default tool system message #} + {{- "<|header_start|>system<|header_end|>\n\n" }} + {{- system_message }} + {%- if tools is not none and not tools_in_user_message %} + {{- "Tools: You have access to the following tools. You might need to use one " + "or more function/tool calls to fulfill the task. 
\n" + "If none are needed, then proceed to the response.\n\n" + "Tool Call Syntax: You can call tools using the following syntax:\n" + "[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n" + "Do not include anything else when calling the tools with the syntax above.\n\n" + "Here is a list of functions in JSON format that you can invoke.\n " }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {%- endif %} + {{- "<|eot|>" }} +{%- endif %} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and tools is not none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- if messages[0]['content'] is string %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- else %} + {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %} + {%- endif %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|header_start|>user<|header_end|>\n\n' -}} + {{- first_user_message}} + {{- "\nHere is a list of functions in JSON format that you can invoke:"}} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- "Should you decide to return the function call(s), put them in the format " + "of [func_name1(params_name1=params_value1, params_name2=params_value2, " + "...), ...]\nDo not include anything else when calling the tools with the " + "syntax above." 
}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text'] | trim }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- "<|eot|>" }} + {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|header_start|>assistant<|header_end|>\n\n' -}} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text'] }} + {%- endif %} + {%- endfor %} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- tool_call.name + '(' -}} + {%- for param in tool_call.arguments %} + {{- param + '=' -}} + {{- "%s" | format(tool_call.arguments[param]) -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ')' -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- "<|eom|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|header_start|>ipython<|header_end|>\n\n" }} + {%- if message.content is string %} + {{- message.content | tojson }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'text' %} + {{- content['text'] | tojson }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- "<|eom|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|header_start|>assistant<|header_end|>\n\n' }} +{%- endif %} diff --git 
a/examples/tool_chat_template_phi4_mini.jinja b/examples/tool_chat_template_phi4_mini.jinja new file mode 100644 index 0000000000000..36423b6c4240a --- /dev/null +++ b/examples/tool_chat_template_phi4_mini.jinja @@ -0,0 +1,60 @@ +{%- if messages %} + {%- if system_message or tools %} +<|system|> + +{%- if system_message %} +{{ system_message }} +{%- endif %} +In addition to plain text responses, you can choose to call one or more of the provided functions. + +Use the following rules to decide when to call a function: + * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so + * if you need external information that can be obtained by calling one or more of the provided functions, generate function calls + +If you decide to call functions: + * prefix function calls with functools marker (no closing marker required) + * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...] + * follow the provided JSON schema. Do not hallucinate arguments or values. Do not blindly copy values from the provided samples + * respect the argument type formatting. 
E.g., if the type is number and format is float, write value 7 as 7.0 + * make sure you pick the right functions that match the user intent + + +{%- if tools %} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %}<|end|> + {%- endif %} + + {%- for message in messages %} + {%- if message.role != "system" %} +<|{{ message.role }}|> + {%- if message.content and message.role == "tools" %} +{"result": {{ message.content }}} + {%- elif message.content %} +{{ message.content }} + {%- elif message.tool_calls %} + {%- for call in message.tool_calls %} +{"name": "{{ call.function.name }}", "arguments": {{ call.function.arguments }}} + {%- if not loop.last %},{% endif %} + {%- endfor %} + {%- endif %}<|end|> + {%- endif %} + {%- endfor %}<|assistant|> + +{%- else %} + {%- if system_message %} +<|system|> + +{{ system_message }}<|end|> + {%- endif %} + {%- if prompt %} +<|user|> + +{{ prompt }}<|end|> + {%- endif %}<|assistant|> + +{%- endif %} +{{ response }} +{%- if response %}<|user|>{% endif %} \ No newline at end of file diff --git a/examples/tool_chat_template_toolace.jinja b/examples/tool_chat_template_toolace.jinja index a9b3b7189dddf..da0f25cdcb337 100644 --- a/examples/tool_chat_template_toolace.jinja +++ b/examples/tool_chat_template_toolace.jinja @@ -44,7 +44,7 @@ {{- tool_call.name + '(' -}} {%- for param in tool_call.arguments %} {{- param + '=' -}} - {{- "%sr" | format(tool_call.arguments[param]) -}} + {{- "%s" | format(tool_call.arguments[param]) -}} {% if not loop.last %}, {% endif %} {%- endfor %} {{- ')' -}} diff --git a/format.sh b/format.sh index fb503ec4bbfcd..6ba93e0a19ba8 100755 --- a/format.sh +++ b/format.sh @@ -1,6 +1,6 @@ #!/bin/bash -echo "vLLM linting system has been moved from format.sh to pre-commit hook." +echo "vLLM linting system has been moved from format.sh to pre-commit hooks." 
echo "Please run 'pip install -r requirements/lint.txt', followed by" -echo "'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook." -echo "Then linters will run automatically before each commit." +echo "'pre-commit install' to install the pre-commit hooks." +echo "Then linters will run automatically before each commit." \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 07616c858f1f3..167e975c70fdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Information Analysis", ] -requires-python = ">=3.9" +requires-python = ">=3.9,<3.13" dynamic = [ "version", "dependencies", "optional-dependencies"] [project.urls] diff --git a/python_only_dev.py b/python_only_dev.py deleted file mode 100644 index a303697b780a6..0000000000000 --- a/python_only_dev.py +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation). - -TL;DR: - -VLLM_USE_PRECOMPILED=1 pip install -e . - -or - -export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -pip install -e . -""" # noqa - -print(msg) diff --git a/requirements/common.txt b/requirements/common.txt index 14084b79121bb..33c4c3219f159 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,13 +1,14 @@ cachetools psutil sentencepiece # Required for LLaMA tokenizer. 
-numpy < 2.0.0 +numpy requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.48.2 # Required for Bamba model and Transformers backend. -tokenizers >= 0.19.1 # Required for Llama 3. +transformers >= 4.51.1 +huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads. +tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. aiohttp @@ -21,23 +22,28 @@ lm-format-enforcer >= 0.10.11, < 0.11 llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" outlines == 0.1.11 lark == 1.2.2 -xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64" +xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs -pyzmq +pyzmq >= 25.0.0 msgspec -gguf == 0.10.0 +gguf >= 0.13.0 importlib_metadata mistral_common[opencv] >= 1.5.4 +opencv-python-headless >= 4.11.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.9.2 # required for compressed-tensors +compressed-tensors == 0.9.3 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files python-json-logger # Used by logging as per examples/other/logging_configuration.md scipy # Required for phi-4-multimodal-instruct ninja # Required for xgrammar, rocm, tpu, xpu +opentelemetry-sdk>=1.26.0,<1.27.0 # vllm.tracing +opentelemetry-api>=1.26.0,<1.27.0 # vllm.tracing +opentelemetry-exporter-otlp>=1.26.0,<1.27.0 # vllm.tracing +opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0 # vllm.tracing diff --git a/requirements/cpu.txt b/requirements/cpu.txt index fc09083781e6f..69f732c2417a1 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -12,6 +12,9 @@ torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" torchaudio==2.6.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch -torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" +torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" torchvision==0.21.0; platform_machine == "ppc64le" datasets # for benchmark scripts + +# cpu cannot use triton 3.3.0 +triton==3.2.0; platform_machine == "x86_64" diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 702d4b0bb320c..cdc6ee75afbcd 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -1,10 +1,11 @@ # Common dependencies -r common.txt -numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. 
Required for N-gram speculative decoding +numba == 0.61.2; python_version > '3.9' # Dependencies for NVIDIA GPUs -ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1. +ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. torch==2.6.0 torchaudio==2.6.0 # These must be updated alongside torch diff --git a/requirements/docs.txt b/requirements/docs.txt index 416ca503b36c0..99fb87def6dd2 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -18,6 +18,7 @@ transformers mistral_common >= 1.5.4 aiohttp starlette +scipy openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args diff --git a/requirements/hpu.txt b/requirements/hpu.txt index a61d72d04f409..830f6ef3f50cb 100644 --- a/requirements/hpu.txt +++ b/requirements/hpu.txt @@ -5,6 +5,7 @@ ray triton==3.1.0 pandas +numpy==1.26.4 tabulate setuptools>=61 setuptools-scm>=8 diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt new file mode 100644 index 0000000000000..20372a9b2ef16 --- /dev/null +++ b/requirements/nightly_torch_test.txt @@ -0,0 +1,28 @@ +# Dependency that able to run entrypoints test +# pytest and its extensions +pytest +pytest-asyncio +pytest-forked +pytest-mock +pytest-rerunfailures +pytest-shard +pytest-timeout + + +librosa # required by audio tests in entrypoints/openai +sentence-transformers +numba == 0.61.2; python_version > '3.9' +# testing utils +awscli +boto3 +botocore +datasets +ray >= 2.10.0 +peft +runai-model-streamer==0.11.0 +runai-model-streamer-s3==0.11.0 +tensorizer>=2.9.0 +lm-eval==0.4.8 +buildkite-test-collector==0.1.9 + +lm-eval[api]==0.4.8 # required for model evaluation test diff --git 
a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 6af78da4993db..05de4ff168453 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -6,7 +6,8 @@ torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 -cmake>=3.26 +triton==3.2 +cmake>=3.26,<4 packaging setuptools>=61 setuptools-scm>=8 diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 345c84b0f6cf2..4df92aab3749e 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -1,7 +1,8 @@ # Common dependencies -r common.txt -numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.61.2; python_version > '3.9' # Dependencies for AMD GPUs awscli diff --git a/requirements/test.in b/requirements/test.in index 5c59bbd1ac7ae..3be580db0674c 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -5,11 +5,12 @@ pytest-forked pytest-asyncio pytest-rerunfailures pytest-shard +pytest-timeout # testing utils awscli backoff # required for phi4mm test -decord # required for video tests +blobfile # required for kimi-vl test einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests @@ -17,7 +18,7 @@ vector_quantize_pytorch # required for minicpmo_26 test vocos # required for minicpmo_26 test peft pqdm -ray[cgraph]>=2.43.0 # Ray Compiled Graph, required by pipeline parallelism tests +ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests @@ -26,11 +27,17 @@ torch==2.6.0 torchaudio==2.6.0 torchvision==0.21.0 transformers_stream_generator # required for qwen-vl test +mamba_ssm # required for plamo2 test matplotlib # required for qwen-vl test mistral_common[opencv] >= 1.5.4 # required for pixtral test +num2words # required for 
smolvlm test +opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test -lm-eval[api]==0.4.4 # required for model evaluation test -transformers==4.48.2 +lm-eval[api]==0.4.8 # required for model evaluation test +transformers==4.51.1 +tokenizers==0.21.1 +huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. +schemathesis>=3.39.15 # Required for openai schema test. # quantization bitsandbytes>=0.45.3 buildkite-test-collector==0.1.9 @@ -38,7 +45,9 @@ buildkite-test-collector==0.1.9 genai_perf==0.0.8 tritonclient==2.51.0 -numpy < 2.0.0 +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.61.2; python_version > '3.9' +numpy runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 diff --git a/requirements/test.txt b/requirements/test.txt index b0ae479604a1e..6dcd4ff01460c 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -20,25 +20,35 @@ aiosignal==1.3.1 annotated-types==0.7.0 # via pydantic anyio==4.6.2.post1 - # via httpx + # via + # httpx + # starlette argcomplete==3.5.1 # via datamodel-code-generator +arrow==1.3.0 + # via isoduration attrs==24.2.0 # via # aiohttp + # hypothesis # jsonlines # jsonschema + # pytest-subtests # referencing audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements/test.in backoff==2.2.1 - # via -r requirements/test.in + # via + # -r requirements/test.in + # schemathesis bitsandbytes==0.45.3 # via -r requirements/test.in black==24.10.0 # via datamodel-code-generator +blobfile==3.0.0 + # via -r requirements/test.in boto3==1.35.57 # via tensorizer botocore==1.35.57 @@ -67,11 +77,13 @@ click==8.1.7 # jiwer # nltk # ray + # schemathesis # typer colorama==0.4.6 # via # awscli # sacrebleu + # schemathesis # tqdm-multiprocess contourpy==1.3.0 # via matplotlib @@ -93,8 +105,6 @@ datasets==3.0.2 # lm-eval decorator==5.1.1 # via librosa -decord==0.6.0 - # 
via -r requirements/test.in dill==0.3.8 # via # datasets @@ -103,12 +113,15 @@ dill==0.3.8 # multiprocess dnspython==2.7.0 # via email-validator +docopt==0.6.2 + # via num2words docutils==0.16 # via awscli einops==0.8.0 # via # -r requirements/test.in # encodec + # mamba-ssm # vector-quantize-pytorch # vocos einx==0.3.0 @@ -127,6 +140,7 @@ fastsafetensors==0.1.10 # via -r requirements/test.in filelock==3.16.1 # via + # blobfile # datasets # huggingface-hub # ray @@ -134,6 +148,8 @@ filelock==3.16.1 # transformers fonttools==4.54.1 # via matplotlib +fqdn==1.5.1 + # via jsonschema frozendict==2.4.6 # via einx frozenlist==1.5.0 @@ -152,16 +168,25 @@ genai-perf==0.0.8 # via -r requirements/test.in genson==1.3.0 # via datamodel-code-generator +graphql-core==3.2.6 + # via hypothesis-graphql h11==0.14.0 # via httpcore +harfile==0.3.0 + # via schemathesis +hf-xet==0.1.4 + # via huggingface-hub hiredis==3.0.0 # via tensorizer httpcore==1.0.6 # via httpx httpx==0.27.2 - # via -r requirements/test.in -huggingface-hub==0.26.2 # via + # -r requirements/test.in + # schemathesis +huggingface-hub==0.30.1 + # via + # -r requirements/test.in # accelerate # datasets # evaluate @@ -173,17 +198,29 @@ huggingface-hub==0.26.2 # vocos humanize==4.11.0 # via runai-model-streamer +hypothesis==6.131.0 + # via + # hypothesis-graphql + # hypothesis-jsonschema + # schemathesis +hypothesis-graphql==0.11.1 + # via schemathesis +hypothesis-jsonschema==0.23.1 + # via schemathesis idna==3.10 # via # anyio # email-validator # httpx + # jsonschema # requests # yarl inflect==5.6.2 # via datamodel-code-generator iniconfig==2.0.0 # via pytest +isoduration==20.11.0 + # via jsonschema isort==5.13.2 # via datamodel-code-generator jinja2==3.1.6 @@ -203,12 +240,18 @@ joblib==1.4.2 # scikit-learn jsonlines==4.0.0 # via lm-eval +jsonpointer==3.0.0 + # via jsonschema jsonschema==4.23.0 # via + # hypothesis-jsonschema # mistral-common # ray + # schemathesis jsonschema-specifications==2024.10.1 # via jsonschema 
+junit-xml==1.9 + # via schemathesis kaleido==0.2.1 # via genai-perf kiwisolver==1.4.7 @@ -219,16 +262,22 @@ libnacl==2.1.0 # via tensorizer librosa==0.10.2.post1 # via -r requirements/test.in -llvmlite==0.43.0 +llvmlite==0.44.0 # via numba -lm-eval==0.4.4 +lm-eval==0.4.8 # via -r requirements/test.in lxml==5.3.0 - # via sacrebleu + # via + # blobfile + # sacrebleu +mamba-ssm==2.2.4 + # via -r requirements/test.in markdown-it-py==3.0.0 # via rich markupsafe==3.0.2 - # via jinja2 + # via + # jinja2 + # werkzeug matplotlib==3.9.2 # via -r requirements/test.in mbstrdecoder==1.1.3 @@ -260,10 +309,16 @@ mypy-extensions==1.0.0 # via black networkx==3.2.1 # via torch +ninja==1.11.1.3 + # via mamba-ssm nltk==3.9.1 # via rouge-score -numba==0.60.0 - # via librosa +num2words==0.5.14 + # via -r requirements/test.in +numba==0.61.2 + # via + # -r requirements/test.in + # librosa numexpr==2.10.1 # via lm-eval numpy==1.26.4 @@ -274,7 +329,6 @@ numpy==1.26.4 # contourpy # cupy-cuda12x # datasets - # decord # einx # encodec # evaluate @@ -335,8 +389,10 @@ nvidia-nvjitlink-cu12==12.4.127 # torch nvidia-nvtx-cu12==12.4.127 # via torch -opencv-python-headless==4.10.0.84 - # via mistral-common +opencv-python-headless==4.11.0.86 + # via + # -r requirements/test.in + # mistral-common packaging==24.1 # via # accelerate @@ -347,6 +403,7 @@ packaging==24.1 # fastparquet # huggingface-hub # lazy-loader + # mamba-ssm # matplotlib # peft # plotly @@ -418,6 +475,8 @@ pybind11==2.13.6 # via lm-eval pycparser==2.22 # via cffi +pycryptodomex==3.22.0 + # via blobfile pydantic==2.9.2 # via # datamodel-code-generator @@ -428,6 +487,8 @@ pygments==2.18.0 # via rich pyparsing==3.2.0 # via matplotlib +pyrate-limiter==3.7.0 + # via schemathesis pytablewriter==1.2.0 # via lm-eval pytest==8.3.3 @@ -440,6 +501,9 @@ pytest==8.3.3 # pytest-mock # pytest-rerunfailures # pytest-shard + # pytest-subtests + # pytest-timeout + # schemathesis pytest-asyncio==0.24.0 # via -r requirements/test.in pytest-forked==1.6.0 
@@ -450,8 +514,13 @@ pytest-rerunfailures==14.0 # via -r requirements/test.in pytest-shard==0.1.2 # via -r requirements/test.in +pytest-subtests==0.14.1 + # via schemathesis +pytest-timeout==2.3.1 + # via -r requirements/test.in python-dateutil==2.9.0.post0 # via + # arrow # botocore # matplotlib # pandas @@ -473,6 +542,7 @@ pyyaml==6.0.2 # peft # ray # responses + # schemathesis # timm # transformers # vocos @@ -503,10 +573,16 @@ requests==2.32.3 # pooch # ray # responses + # schemathesis + # starlette-testclient # tiktoken # transformers responses==0.25.3 # via genai-perf +rfc3339-validator==0.1.4 + # via jsonschema +rfc3987==1.3.8 + # via jsonschema rich==13.9.4 # via # genai-perf @@ -535,6 +611,8 @@ safetensors==0.4.5 # peft # timm # transformers +schemathesis==3.39.15 + # via -r requirements/test.in scikit-learn==1.5.2 # via # librosa @@ -553,18 +631,23 @@ sentencepiece==0.2.0 # via mistral-common setuptools==75.8.0 # via + # mamba-ssm # pytablewriter # torch shellingham==1.5.4 # via typer six==1.16.0 # via + # junit-xml # python-dateutil + # rfc3339-validator # rouge-score sniffio==1.3.1 # via # anyio # httpx +sortedcontainers==2.4.0 + # via hypothesis soundfile==0.12.1 # via # -r requirements/test.in @@ -573,6 +656,12 @@ soxr==0.5.0.post1 # via librosa sqlitedict==2.1.0 # via lm-eval +starlette==0.46.2 + # via + # schemathesis + # starlette-testclient +starlette-testclient==0.4.1 + # via schemathesis statsmodels==0.14.4 # via genai-perf sympy==1.13.1 @@ -599,8 +688,14 @@ tiktoken==0.7.0 # mistral-common timm==1.0.11 # via -r requirements/test.in -tokenizers==0.21.0 - # via transformers +tokenizers==0.21.1 + # via + # -r requirements/test.in + # transformers +tomli==2.2.1 + # via schemathesis +tomli-w==1.2.0 + # via schemathesis torch==2.6.0 # via # -r requirements/test.in @@ -609,6 +704,7 @@ torch==2.6.0 # encodec # fastsafetensors # lm-eval + # mamba-ssm # peft # runai-model-streamer # sentence-transformers @@ -641,11 +737,12 @@ tqdm==4.66.6 # transformers 
tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.48.2 +transformers==4.51.1 # via # -r requirements/test.in # genai-perf # lm-eval + # mamba-ssm # peft # sentence-transformers # transformers-stream-generator @@ -664,6 +761,8 @@ typepy==1.3.2 # tabledata typer==0.15.2 # via fastsafetensors +types-python-dateutil==2.9.0.20241206 + # via arrow typing-extensions==4.12.2 # via # huggingface-hub @@ -676,8 +775,11 @@ typing-extensions==4.12.2 # typer tzdata==2024.2 # via pandas +uri-template==1.3.0 + # via jsonschema urllib3==2.2.3 # via + # blobfile # botocore # requests # responses @@ -686,6 +788,10 @@ vector-quantize-pytorch==1.21.2 # via -r requirements/test.in vocos==0.1.0 # via -r requirements/test.in +webcolors==24.11.1 + # via jsonschema +werkzeug==3.1.3 + # via schemathesis word2number==1.1 # via lm-eval xxhash==3.5.0 @@ -693,6 +799,8 @@ xxhash==3.5.0 # datasets # evaluate yarl==1.17.1 - # via aiohttp + # via + # aiohttp + # schemathesis zstandard==0.23.0 # via lm-eval diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 35d5db6c46006..b63993ba1ee45 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -17,9 +17,9 @@ ray[data] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[tpu, pallas] @ 
https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch==2.8.0.dev20250408 +torchvision==0.22.0.dev20250408 +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" + diff --git a/setup.py b/setup.py index 37f3e78926c6e..b0cc2f48163c3 100755 --- a/setup.py +++ b/setup.py @@ -201,6 +201,9 @@ class cmake_build_ext(build_ext): else: # Default build tool to whatever cmake picks. 
build_tool = [] + # Make sure we use the nvcc from CUDA_HOME + if _is_cuda(): + cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc'] subprocess.check_call( ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], cwd=self.build_temp) @@ -592,9 +595,8 @@ def get_requirements() -> list[str]: for line in requirements: if line.startswith("-r "): resolved_requirements += _read_requirements(line.split()[1]) - elif line.startswith("--"): - continue - else: + elif not line.startswith("--") and not line.startswith( + "#") and line.strip() != "": resolved_requirements.append(line) return resolved_requirements @@ -640,11 +642,10 @@ if _is_hip(): if _is_cuda(): ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) - if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"): - # FA3 requires CUDA 12.0 or later + if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"): + # FA3 requires CUDA 12.3 or later ext_modules.append( CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) - if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"): # Optional since this doesn't get built (produce an .so file) when # not targeting a hopper system ext_modules.append( @@ -683,7 +684,7 @@ setup( "fastsafetensors": ["fastsafetensors >= 0.1.10"], "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"], "audio": ["librosa", "soundfile"], # Required for audio processing - "video": ["decord"] # Required for video processing + "video": [] # Kept for backwards compatibility }, cmdclass=cmdclass, package_data=package_data, diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index 436e43638a3dd..be3ad12396b4b 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,15 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -import pytest - from ..utils import compare_two_settings 
-@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - monkeypatch.setenv('VLLM_USE_V1', '0') - - def test_cpu_offload(): compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 31aa898282004..76b266aada684 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -155,6 +155,24 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): llm.wake_up() output2 = llm.generate(prompt, sampling_params) - # cmp output assert output[0].outputs[0].text == output2[0].outputs[0].text + + llm.sleep(level=1) + llm.wake_up(tags=["weights"]) + + free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline + + # should just reallocate memory for weights (1B model, ~2GiB weights) + if use_v1: + assert used_bytes < 10 * GiB_bytes + else: + assert used_bytes < 6 * GiB_bytes + + # now allocate kv cache memory + llm.wake_up(tags=["kv_cache"]) + output3 = llm.generate(prompt, sampling_params) + + # cmp output + assert output[0].outputs[0].text == output3[0].outputs[0].text diff --git a/tests/entrypoints/openai/reasoning_parsers/__init__.py b/tests/benchmarks/__init__.py similarity index 100% rename from tests/entrypoints/openai/reasoning_parsers/__init__.py rename to tests/benchmarks/__init__.py diff --git a/tests/benchmarks/test_latency_cli.py b/tests/benchmarks/test_latency_cli.py new file mode 100644 index 0000000000000..8537459b9f94d --- /dev/null +++ b/tests/benchmarks/test_latency_cli.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +import subprocess + +import pytest + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" + + +@pytest.mark.benchmark +def test_bench_latency(): + command = [ + "vllm", "bench", "latency", "--model", MODEL_NAME, "--input-len", "32", + "--output-len", "1", 
"--enforce-eager", "--load-format", "dummy" + ] + result = subprocess.run(command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + + assert result.returncode == 0, f"Benchmark failed: {result.stderr}" diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py new file mode 100644 index 0000000000000..b746d6b7853c9 --- /dev/null +++ b/tests/benchmarks/test_serve_cli.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +import subprocess + +import pytest + +from ..utils import RemoteOpenAIServer + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--max-model-len", "1024", "--enforce-eager", "--load-format", "dummy" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.benchmark +def test_bench_serve(server): + command = [ + "vllm", + "bench", + "serve", + "--model", + MODEL_NAME, + "--host", + server.host, + "--port", + str(server.port), + "--random-input-len", + "32", + "--random-output-len", + "4", + "--num-prompts", + "5", + ] + result = subprocess.run(command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + + assert result.returncode == 0, f"Benchmark failed: {result.stderr}" diff --git a/tests/benchmarks/test_throughput_cli.py b/tests/benchmarks/test_throughput_cli.py new file mode 100644 index 0000000000000..2045b36293565 --- /dev/null +++ b/tests/benchmarks/test_throughput_cli.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +import subprocess + +import pytest + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" + + +@pytest.mark.benchmark +def test_bench_throughput(): + command = [ + "vllm", "bench", "throughput", "--model", MODEL_NAME, "--input-len", + "32", "--output-len", "1", "--enforce-eager", "--load-format", "dummy" + ] + result = subprocess.run(command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + 
+ assert result.returncode == 0, f"Benchmark failed: {result.stderr}" diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 7307f44b6184e..d4551b1cc3aec 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -63,7 +63,8 @@ class LlamaConfig: factors.append((k, v)) factors.sort() import hashlib - return hashlib.md5(str(factors).encode()).hexdigest() + return hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() def __post_init__(self): assert self.mlp_size >= self.hidden_size diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 3a45c35442ca8..579133ec0c3f6 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -2,21 +2,20 @@ from __future__ import annotations -from typing import Any +from typing import Any, Optional, Union import pytest import torch from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.config import CompilationLevel +from vllm.config import CompilationConfig, CompilationLevel from vllm.platforms import current_platform from ..utils import create_new_process_for_each_test -@pytest.fixture(params=None, name="model_info") -def models_list_fixture(request): +def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): TEST_MODELS: list[tuple[str, dict[str, Any]]] = [ ("facebook/opt-125m", {}), ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { @@ -33,51 +32,57 @@ def models_list_fixture(request): ("meta-llama/Llama-3.2-1B-Instruct", {}), ] - if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) - - # TODO: figure out why this fails. 
- if False and is_quant_method_supported("gguf"): # noqa: SIM223 - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { - "quantization": "gguf" - })) - - if is_quant_method_supported("gptq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { - "quantization": "gptq" - })) - - if is_quant_method_supported("gptq_marlin"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { - "quantization": "gptq_marlin" - })) - - if is_quant_method_supported("gptq_marlin_24"): - TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { - "quantization": "gptq_marlin_24" - })) - - if is_quant_method_supported("marlin"): - TEST_MODELS.append( - ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" + if all: + if is_quant_method_supported("aqlm"): + TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { + "quantization": "aqlm" })) - if not current_platform.is_rocm() and is_quant_method_supported("awq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { - "quantization": "AWQ" - })) + # TODO: figure out why this fails. 
+ if False and is_quant_method_supported("gguf"): # noqa: SIM223 + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { + "quantization": "gguf" + })) - return TEST_MODELS + if is_quant_method_supported("gptq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { + "quantization": "gptq" + })) + + if is_quant_method_supported("gptq_marlin"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { + "quantization": "gptq_marlin" + })) + + if is_quant_method_supported("gptq_marlin_24"): + TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { + "quantization": "gptq_marlin_24" + })) + + if is_quant_method_supported("marlin"): + TEST_MODELS.append( + ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { + "quantization": "marlin" + })) + + if not current_platform.is_rocm() and is_quant_method_supported("awq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { + "quantization": "AWQ" + })) + + if keywords is None: + return TEST_MODELS + + # filter by keywords + pred = lambda model: any(keyword in model[0] for keyword in keywords) + return list(filter(pred, TEST_MODELS)) @pytest.mark.parametrize( "optimization_level", [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], ) -@pytest.mark.parametrize("model_info", "", indirect=True) +@pytest.mark.parametrize("model_info", models_list(all=True)) @create_new_process_for_each_test() def test_full_graph( monkeypatch: pytest.MonkeyPatch, @@ -91,25 +96,60 @@ def test_full_graph( m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") print(f"MODEL={model}") - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0) - llm = LLM( - model=model, - enforce_eager=True, - tensor_parallel_size=1, - disable_custom_all_reduce=True, - compilation_config=optimization_level, - **model_kwargs, - ) - outputs = llm.generate(prompts, sampling_params) + 
run_model(optimization_level, model, model_kwargs) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +PassConfig = CompilationConfig.PassConfig + + +# TODO(luka) add other supported compilation config scenarios here +@pytest.mark.parametrize( + "compilation_config, model_info", + [ + # additional compile sizes, only some of the models + (CompilationConfig(level=CompilationLevel.PIECEWISE, + compile_sizes=[1, 2]), model) + for model in models_list(all=False) + ] + [ + # RMSNorm + quant fusion, only 8-bit quant models + (CompilationConfig(level=CompilationLevel.PIECEWISE, + custom_ops=["+rms_norm"], + pass_config=PassConfig(enable_fusion=True, + enable_noop=True)), model) + for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"]) + ]) +# only test some of the models +@create_new_process_for_each_test() +def test_custom_compile_config( + compilation_config: CompilationConfig, + model_info: tuple[str, dict[str, Any]], +): + model, model_kwargs = model_info + print(f"MODEL={model}") + run_model(compilation_config, model, model_kwargs) + + +def run_model(compile_config: Union[int, CompilationConfig], model: str, + model_kwargs: dict[str, Any]): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0) + llm = LLM( + model=model, + enforce_eager=True, + tensor_parallel_size=1, + disable_custom_all_reduce=True, + compilation_config=compile_config, + **model_kwargs, + ) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. 
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index aaf0277810907..efebf05b6b047 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -2,7 +2,6 @@ import pytest import torch -from compressed_tensors.quantization import FP8_DTYPE import vllm.envs as envs import vllm.plugins @@ -14,9 +13,12 @@ from vllm.config import CompilationConfig, CompilationLevel, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity) +from vllm.platforms import current_platform from .backend import TestBackend +FP8_DTYPE = current_platform.fp8_dtype() + class TestModel(torch.nn.Module): @@ -42,12 +44,17 @@ class TestModel(torch.nn.Module): resid = torch.sqrt(x) y = self.norm[0](x) - x2 = self.fp8_linear.apply(y, self.w[0], self.wscale[0], self.scale[0]) + x2 = self.fp8_linear.apply(y, + self.w[0], + self.wscale[0], + input_scale=self.scale[0]) # make sure resid is used for replacement to work y2, resid = self.norm[1](x2, resid) - x3 = self.fp8_linear.apply(y2, self.w[1], self.wscale[1], - self.scale[1]) + x3 = self.fp8_linear.apply(y2, + self.w[1], + self.wscale[1], + input_scale=self.scale[1]) y3, resid = self.norm[2](x3, resid) # use resid here return y3 @@ -59,8 +66,8 @@ class TestModel(torch.nn.Module): @pytest.mark.parametrize("static", [True, False]) @pytest.mark.parametrize("cutlass_fp8_enabled", [True, False] if CUTLASS_FP8_SUPPORTED else [False]) -@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", - reason="Only test on CUDA") +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], + reason="Only test on CUDA and ROCm") def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, 
cutlass_fp8_enabled): torch.set_default_device("cuda") diff --git a/tests/data/test_config.yaml b/tests/config/test_config.yaml similarity index 100% rename from tests/data/test_config.yaml rename to tests/config/test_config.yaml diff --git a/tests/config/test_config_with_model.yaml b/tests/config/test_config_with_model.yaml new file mode 100644 index 0000000000000..d8c8c7bc8162a --- /dev/null +++ b/tests/config/test_config_with_model.yaml @@ -0,0 +1,7 @@ +# Same as test_config.yaml but with model specified +model: config-model +port: 12312 +served_model_name: mymodel +tensor_parallel_size: 2 +trust_remote_code: true +multi_step_stream_outputs: false diff --git a/tests/conftest.py b/tests/conftest.py index cc48fceb8eff0..25e70319e2cc8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,12 +29,11 @@ from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - TokensPrompt, to_enc_dec_tuple_list, - zip_enc_dec_prompts) + to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams -from vllm.utils import cuda_device_count_stateless, is_list_of +from vllm.utils import cuda_device_count_stateless logger = init_logger(__name__) @@ -469,12 +468,19 @@ class HfRunner: prompts: list[str], beam_width: int, max_tokens: int, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, ) -> list[tuple[list[list[int]], list[str]]]: outputs = self.generate(prompts, do_sample=False, max_new_tokens=max_tokens, num_beams=beam_width, - num_return_sequences=beam_width) + num_return_sequences=beam_width, + images=images, + videos=videos, + audios=audios) + for i in range(len(outputs)): output_ids, output_str = outputs[i] for j in range(len(output_ids)): @@ -671,8 +677,9 @@ 
class HfRunner: return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] - def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]: - return self.model.encode(prompts) + def encode(self, prompts: list[str], *args, + **kwargs) -> list[list[torch.Tensor]]: + return self.model.encode(prompts, *args, **kwargs) def predict(self, prompts: list[list[str]]) -> torch.Tensor: return self.model.predict(prompts, convert_to_tensor=True) @@ -747,30 +754,27 @@ class VllmRunner: videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, ) -> list[TextPrompt]: - if images is not None: - assert len(prompts) == len(images) - if videos is not None: - assert len(prompts) == len(videos) + if any(x is not None and len(x) != len(prompts) + for x in [images, videos, audios]): + raise ValueError( + "All non-None multimodal inputs must have the same length as " + "prompts") - if audios is not None: - assert len(prompts) == len(audios) + inputs = [] + for i, prompt in enumerate(prompts): + multi_modal_data = {} + if images is not None and (image := images[i]) is not None: + multi_modal_data["image"] = image + if videos is not None and (video := videos[i]) is not None: + multi_modal_data["video"] = video + if audios is not None and (audio := audios[i]) is not None: + multi_modal_data["audio"] = audio - inputs = [TextPrompt(prompt=prompt) for prompt in prompts] - if images is not None: - for i, image in enumerate(images): - if image is not None: - inputs[i]["multi_modal_data"] = {"image": image} - - if videos is not None: - for i, video in enumerate(videos): - if video is not None: - inputs[i]["multi_modal_data"] = {"video": video} - - if audios is not None: - for i, audio in enumerate(audios): - if audio is not None: - inputs[i]["multi_modal_data"] = {"audio": audio} + inputs.append( + TextPrompt(prompt=prompt, + multi_modal_data=multi_modal_data + if multi_modal_data else None)) return inputs @@ -921,6 +925,7 
@@ class VllmRunner: max_tokens: int, num_logprobs: int, num_prompt_logprobs: Optional[int] = None, + skip_special_tokens: bool = True, ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( @@ -928,6 +933,7 @@ class VllmRunner: max_tokens=max_tokens, logprobs=num_logprobs, prompt_logprobs=(num_prompt_logprobs), + skip_special_tokens=skip_special_tokens, ) ''' Greedy logprobs generation for vLLM encoder/decoder models @@ -938,18 +944,20 @@ class VllmRunner: def generate_beam_search( self, - prompts: Union[list[str], list[list[int]]], + prompts: list[str], beam_width: int, max_tokens: int, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, ) -> list[tuple[list[list[int]], list[str]]]: - if is_list_of(prompts, str, check="all"): - prompts = [TextPrompt(prompt=prompt) for prompt in prompts] - else: - prompts = [ - TokensPrompt(prompt_token_ids=tokens) for tokens in prompts - ] + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + outputs = self.model.beam_search( - prompts, + inputs, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) returned_outputs = [] for output in outputs: @@ -962,19 +970,19 @@ class VllmRunner: req_outputs = self.model.classify(prompts) return [req_output.outputs.probs for req_output in req_outputs] - def encode( - self, - prompts: list[str], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, - ) -> list[list[float]]: + def encode(self, + prompts: list[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + *args, + **kwargs) -> list[list[float]]: inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - req_outputs = self.model.embed(inputs) + req_outputs = 
self.model.embed(inputs, *args, **kwargs) return [req_output.outputs.embedding for req_output in req_outputs] def score( @@ -1120,3 +1128,15 @@ def pytest_collection_modifyitems(config, items): for item in items: if "optional" in item.keywords: item.add_marker(skip_optional) + + +@pytest.fixture(scope="session") +def cli_config_file(): + """Return the path to the CLI config file.""" + return os.path.join(_TEST_DIR, "config", "test_config.yaml") + + +@pytest.fixture(scope="session") +def cli_config_file_with_model(): + """Return the path to the CLI config file with model.""" + return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml") diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index e9b537ed5150e..9e8e315d87b18 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -195,15 +195,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, ]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{ - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 2, "max_num_seqs": 2, }, { - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 3, "max_num_seqs": 2, }, { - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 256, "max_num_seqs": 10, }]) diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index e23b8718cb632..039b5e739892a 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -129,12 +129,16 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, check_answers(indices, answer, test_texts) -def prep_prompts(batch_size: int): +def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)): """ Generate prompts which a bunch of assignments, then asking for the value of one of them. 
The prompt is just under 10k tokens; sliding window is 4k so the answer is outside sliding window, but should still be correct. + + Args: + batch_size: number of prompts to generate + ln_range: an argument to control the length of the prompt """ prompts: list[str] = [] answer: list[int] = [] @@ -145,7 +149,7 @@ def prep_prompts(batch_size: int): indices.append(idx) prompt = "```python\n# We set a number of variables, " + \ f"x{idx} will be important later\n" - ln = random.randint(800, 1100) + ln = random.randint(*ln_range) for k in range(30, ln): v = random.randint(10, 99) if k == idx: @@ -157,7 +161,10 @@ def prep_prompts(batch_size: int): return prompts, answer, indices -def check_answers(indices: list[int], answer: list[int], outputs: list[str]): +def check_answers(indices: list[int], + answer: list[int], + outputs: list[str], + accept_rate: float = 0.7): answer2 = [int(text[0:2].strip()) for text in outputs] print(list(zip(indices, zip(answer, answer2)))) numok = 0 @@ -166,7 +173,7 @@ def check_answers(indices: list[int], answer: list[int], outputs: list[str]): numok += 1 frac_ok = numok / len(answer) print(f"Num OK: {numok}/{len(answer)} {frac_ok}") - assert frac_ok > 0.7 + assert frac_ok >= accept_rate def check_window(prompts: list[str]): diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index bfa7d06c4d075..a7ba45c9e546e 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -106,7 +106,7 @@ def eager_allreduce( # communicate independently num_communication = rank // tp_size + 1 sz = 1024 - fa = get_tp_group().ca_comm + fa = get_tp_group().device_communicator.ca_comm inp = torch.ones(sz, dtype=torch.float32, device=device) out = inp for _ in range(num_communication): diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index e757db45c8cf5..05e30f855ced2 100644 --- 
a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -175,7 +175,7 @@ TEXT_GENERATION_MODELS = { "inceptionai/jais-13b-chat": PPTestSettings.fast(), "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(), - # Tests TransformersModel + # Tests TransformersForCausalLM "ArthurZ/Ilama-3.2-1B": PPTestSettings.fast(), "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(), "openbmb/MiniCPM3-4B": PPTestSettings.fast(), @@ -217,7 +217,7 @@ EMBEDDING_MODELS = { # type: ignore[var-annotated] MULTIMODAL_MODELS = { # [Decoder-only] - "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(), + "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(), "facebook/chameleon-7b": PPTestSettings.fast(), "adept/fuyu-8b": PPTestSettings.fast(), "THUDM/glm-4v-9b": PPTestSettings.fast(), @@ -245,7 +245,7 @@ TEST_MODELS = [ # [LANGUAGE GENERATION] "microsoft/Phi-3.5-MoE-instruct", "meta-llama/Llama-3.2-1B-Instruct", - # "ArthurZ/Ilama-3.2-1B", NOTE: Uncomment after #13905 + "ArthurZ/Ilama-3.2-1B", "ibm/PowerLM-3b", # [LANGUAGE EMBEDDING] "intfloat/e5-mistral-7b-instruct", diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 8698d124e73ff..7902011519d90 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from argparse import ArgumentTypeError +from argparse import ArgumentError, ArgumentTypeError import pytest @@ -10,7 +10,7 @@ from vllm.utils import FlexibleArgumentParser @pytest.mark.parametrize(("arg", "expected"), [ - (None, None), + (None, dict()), ("image=16", { "image": 16 }), @@ -24,6 +24,10 @@ from vllm.utils import FlexibleArgumentParser }), ]) def test_limit_mm_per_prompt_parser(arg, expected): + """This functionality is deprecated and will be removed in the future. + This argument should be passed as JSON string instead. 
+ + TODO: Remove with nullable_kvs.""" parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) if arg is None: args = parser.parse_args([]) @@ -53,12 +57,20 @@ def test_compilation_config(): assert args.compilation_config.level == 3 # set to string form of a dict - args = parser.parse_args(["--compilation-config", "{'level': 3}"]) - assert args.compilation_config.level == 3 + args = parser.parse_args([ + "--compilation-config", + "{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}", + ]) + assert (args.compilation_config.level == 3 and + args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]) # set to string form of a dict - args = parser.parse_args(["--compilation-config={'level': 3}"]) - assert args.compilation_config.level == 3 + args = parser.parse_args([ + "--compilation-config=" + "{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}", + ]) + assert (args.compilation_config.level == 3 and + args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]) def test_prefix_cache_default(): @@ -142,3 +154,39 @@ def test_composite_arg_parser(arg, expected, option): else: args = parser.parse_args([f"--{option}", arg]) assert getattr(args, option.replace("-", "_")) == expected + + +def test_human_readable_model_len(): + # `exit_on_error` disabled to test invalid values below + parser = EngineArgs.add_cli_args( + FlexibleArgumentParser(exit_on_error=False)) + + args = parser.parse_args([]) + assert args.max_model_len is None + + args = parser.parse_args(["--max-model-len", "1024"]) + assert args.max_model_len == 1024 + + # Lower + args = parser.parse_args(["--max-model-len", "1m"]) + assert args.max_model_len == 1_000_000 + args = parser.parse_args(["--max-model-len", "10k"]) + assert args.max_model_len == 10_000 + + # Capital + args = parser.parse_args(["--max-model-len", "3K"]) + assert args.max_model_len == 1024 * 3 + args = parser.parse_args(["--max-model-len", "10M"]) + assert args.max_model_len == 2**20 * 10 + + # Decimal values + args = 
parser.parse_args(["--max-model-len", "10.2k"]) + assert args.max_model_len == 10200 + # ..truncated to the nearest int + args = parser.parse_args(["--max-model-len", "10.212345k"]) + assert args.max_model_len == 10212 + + # Invalid (do not allow decimals with binary multipliers) + for invalid in ["1a", "pwd", "10.24", "1.23M"]: + with pytest.raises(ArgumentError): + args = parser.parse_args(["--max-model-len", invalid]) diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py index d5111e3fda8fd..b29d6362f571b 100644 --- a/tests/engine/test_short_mm_context.py +++ b/tests/engine/test_short_mm_context.py @@ -18,7 +18,8 @@ models = ["llava-hf/llava-1.5-7b-hf"] def test_context_length_too_short(vllm_runner, image_assets, model): images = [asset.pil_image for asset in image_assets] - with pytest.raises(ValueError, match="too long to fit into the model"): + with pytest.raises(ValueError, + match="longer than the maximum model length"): vllm_model = vllm_runner( model, max_model_len=128, # LLaVA has a feature size of 576 diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 77fbb5827da9e..95657455bd7bb 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -13,18 +13,24 @@ import pytest from vllm.platforms import current_platform -MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +MODEL_NAMES = [ + "Qwen/Qwen2-1.5B-Instruct", + "google/gemma-3-1b-it", +] NUM_CONCURRENT = 500 TASK = "gsm8k" FILTER = "exact_match,strict-match" RTOL = 0.03 -EXPECTED_VALUE = 0.58 +EXPECTED_VALUES = { + "Qwen/Qwen2-1.5B-Instruct": 0.58, + "google/gemma-3-1b-it": 0.25, +} -def run_test(more_args=None): +def run_test(model_name, more_args=None): """Run the end to end accuracy test.""" - model_args = f"pretrained={MODEL_NAME},max_model_len=4096" + model_args = f"pretrained={model_name},max_model_len=4096" if more_args is not None: model_args = "{},{}".format(model_args, more_args) 
@@ -37,9 +43,12 @@ def run_test(more_args=None): ) measured_value = results["results"][TASK][FILTER] - assert (measured_value - RTOL < EXPECTED_VALUE - and measured_value + RTOL > EXPECTED_VALUE - ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + assert model_name in EXPECTED_VALUES, ( + f"Cannot find the expected value for the model {model_name=}") + expected_value = EXPECTED_VALUES[model_name] + assert (measured_value - RTOL < expected_value + and measured_value + RTOL > expected_value + ), f"Expected: {expected_value} | Measured: {measured_value}" # TODO: [AlexM] Fix it with new CI/CD tests @@ -49,7 +58,8 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4" @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 is currently only supported on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): +@pytest.mark.parametrize("model", MODEL_NAMES) +def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" with monkeypatch.context() as m: @@ -58,13 +68,13 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): more_args = None if current_platform.is_tpu(): # Limit compilation time for TPU V1 - more_args = "max_num_seqs=64" + more_args = "max_model_len=2048,max_num_seqs=64" # Add TP test (if provided) if TPU_TP_TEST_STR: more_args += ",{}".format(TPU_TP_TEST_STR) - run_test(more_args) + run_test(model, more_args) def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch): @@ -72,4 +82,4 @@ def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "0") - run_test() + run_test("Qwen/Qwen2-1.5B-Instruct") diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 90e1d58141378..099af0f36088b 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ 
b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -23,7 +23,19 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora" @pytest.fixture(scope="module") -def llm(): +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module", params=[False, True]) +def llm(request, monkeypatch_module): + + use_v1 = request.param + monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM(model=MODEL_NAME, diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 5f1a91cb2b19f..e43e9826e8f9b 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -3,6 +3,7 @@ import json import re import weakref +from enum import Enum import jsonschema import pytest @@ -15,7 +16,10 @@ from vllm.sampling_params import GuidedDecodingParams, SamplingParams MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" GUIDED_DECODING_BACKENDS = [ - "outlines", "lm-format-enforcer", "xgrammar", "guidance" + "outlines", + "lm-format-enforcer", + "xgrammar:disable-any-whitespace", + "guidance:disable-any-whitespace", ] @@ -282,15 +286,26 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm): @pytest.mark.skip_global_cleanup def test_disable_guided_decoding_fallback(sample_regex, llm): + # see has_xgrammar_unsupported_json_features() + unsupported_json = { + "type": "object", + "properties": { + "example": { + "type": "string", + "minLength": 5 # unsupported by xgrammar + } + } + } sampling_params = SamplingParams(temperature=0.8, top_p=0.95, guided_decoding=GuidedDecodingParams( - regex=sample_regex, + json=unsupported_json, backend="xgrammar:no-fallback")) with pytest.raises( ValueError, - match="xgrammar does not support regex guided decoding"): + match="xgrammar does not support advanced 
JSON schema features " + "like enums, patterns or numeric ranges."): llm.generate(prompts="This should fail", sampling_params=sampling_params, use_tqdm=True) @@ -322,59 +337,50 @@ def test_guided_json_object(llm, guided_decoding_backend: str): print(generated_text) assert generated_text is not None + if 'disable-any-whitespace' in guided_decoding_backend: + assert "\n" not in generated_text + # Parse to verify it is valid JSON parsed_json = json.loads(generated_text) assert isinstance(parsed_json, dict) +class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + +class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + @pytest.mark.skip_global_cleanup -def test_json_with_any_whitespace_disabled(llm): - - class ResponseSchema(BaseModel): - clarifying_question: str - cost_per_serving: str - calories: str - type_dish_ids: str - type_meal_ids: str - product_ids: list[str] - exclude_product_ids: list[str] - allergen_ids: list[str] - total_cooking_time: str - kitchen_ids: str - holiday_ids: str - - # Note: Without this setting, the response is sometimes full of `\n` - # for some models. This option prevents that. - guided_decoding_backend = 'xgrammar:disable-any-whitespace' - - schema = ResponseSchema.model_json_schema() - guided_params = GuidedDecodingParams(json=schema, - backend=\ - guided_decoding_backend) - sampling_params = SamplingParams(max_tokens=2000, - frequency_penalty=0, - presence_penalty=-1.1, - repetition_penalty=1.3, - guided_decoding=guided_params) - - prompt = ("<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You" - "are a helpful assistant.<|im_end|>\n<|im_start|>user\nI want a " - "quick launch fast with $10.<|im_end|>\n<|im_start|>assistant\n") - outputs = llm.generate(prompts=prompt, - sampling_params=sampling_params, - use_tqdm=True) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str): + json_schema = CarDescription.model_json_schema() + sampling_params = SamplingParams(temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=json_schema, + backend=guided_decoding_backend)) + outputs = llm.generate( + prompts="Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's", + sampling_params=sampling_params, + use_tqdm=True) assert outputs is not None - for output in outputs: assert output is not None assert isinstance(output, RequestOutput) + prompt = output.prompt generated_text = output.outputs[0].text assert generated_text is not None - assert "\n" not in generated_text - - # Parse to verify it is valid JSON - parsed_json = json.loads(generated_text) - assert isinstance(parsed_json, dict) - jsonschema.validate(instance=parsed_json, schema=schema) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, schema=json_schema) \ No newline at end of file diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 61bd1d462a50f..665c6ea1e6994 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -15,7 +15,7 @@ def v1(run_with_both_engines): def test_empty_prompt(): llm = LLM(model="openai-community/gpt2", enforce_eager=True) - with pytest.raises(ValueError, match='Prompt cannot be empty'): + with pytest.raises(ValueError, match='decoder prompt cannot be empty'): llm.generate([""]) diff --git 
a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index eca5d184f5d60..642c204b9ff00 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -150,6 +150,7 @@ def test_wer_correctness(model_name, expected_wer, n_examples=-1, max_concurrent_request=None): + # TODO refactor to use `ASRDataset` with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server: dataset = load_hf_dataset(dataset_repo) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 3267dcc15e4a9..72e616656775e 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import openai import pytest import pytest_asyncio @@ -12,7 +14,9 @@ from ...utils import RemoteOpenAIServer MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" TEST_AUDIO_URLS = [ AudioAsset("winning_call").url, + AudioAsset("mary_had_lamb").url, ] +MAXIMUM_AUDIOS = 2 @pytest.fixture(scope="module") @@ -24,6 +28,8 @@ def server(): "5", "--enforce-eager", "--trust-remote-code", + "--limit-mm-per-prompt", + json.dumps({"audio": MAXIMUM_AUDIOS}), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -46,7 +52,7 @@ def base64_encoded_audio() -> dict[str, str]: @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_audio(client: openai.AsyncOpenAI, model_name: str, audio_url: str): messages = [{ @@ -100,7 +106,36 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) 
-@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) +async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI, + model_name: str, + audio_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "audio_url", + "audio_url": audio_url + }, + { + "type": "text", + "text": "What's happening in this audio?" + }, + ], + }] + + # audio_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_audio_base64encoded( client: openai.AsyncOpenAI, model_name: str, audio_url: str, base64_encoded_audio: dict[str, str]): @@ -158,7 +193,7 @@ async def test_single_chat_session_audio_base64encoded( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_input_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str, base64_encoded_audio: dict[str, str]): @@ -330,28 +365,21 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) +@pytest.mark.parametrize( + "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]]) async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, - audio_url: str, - base64_encoded_audio: dict[str, str]): + audio_urls: list[str]): messages = [{ "role": "user", "content": [ - { + *({ "type": "audio_url", "audio_url": { "url": audio_url } - }, - { - "type": 
"input_audio", - "input_audio": { - "data": base64_encoded_audio[audio_url], - "format": "wav" - } - }, + } for audio_url in audio_urls), { "type": "text", "text": "What's happening in this audio?" @@ -359,20 +387,30 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, ], }] - with pytest.raises(openai.BadRequestError): # test multi-audio input - await client.chat.completions.create( + if len(audio_urls) > MAXIMUM_AUDIOS: + with pytest.raises(openai.BadRequestError): # test multi-audio input + await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + completion = completion.choices[0].text + assert completion is not None and len(completion) >= 0 + else: + chat_completion = await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=10, temperature=0.0, ) - - # the server should still work afterwards - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - completion = completion.choices[0].text - assert completion is not None and len(completion) >= 0 + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 25e4595cef6f6..a10b42ea3a4b5 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -11,7 +11,7 @@ import pytest import pytest_asyncio import requests import torch -from openai import BadRequestError +from openai import BadRequestError, OpenAI from ...utils import RemoteOpenAIServer from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 @@ -20,11 +20,25 @@ from 
.test_completion import zephyr_lora_files # noqa: F401 # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] - @pytest.fixture(scope="module") -def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811 +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module", params=[False, True]) +def server( + request, + monkeypatch_module, + zephyr_lora_files, #noqa: F811 + zephyr_lora_added_tokens_files): # noqa: F811 + + use_v1 = request.param + monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -49,6 +63,13 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811 yield remote_server +@pytest.fixture +def is_v1_server(server): + import os + assert os.environ['VLLM_USE_V1'] in ['0', '1'] + return os.environ['VLLM_USE_V1'] == '1' + + @pytest_asyncio.fixture async def client(server): async with server.get_async_client() as async_client: @@ -464,14 +485,8 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, assert last_completion_tokens == 10 -# NOTE: Not sure why, but when I place this after `test_guided_regex_chat` -# (i.e. 
using the same ordering as in the Completions API tests), the test -# will fail on the second `guided_decoding_backend` even when I swap their order -# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256) @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_choice_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str, sample_guided_choice): messages = [{ "role": "system", @@ -487,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_choice=sample_guided_choice)) choice1 = chat_completion.choices[0].message.content assert choice1 in sample_guided_choice @@ -502,18 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_choice=sample_guided_choice)) choice2 = chat_completion.choices[0].message.content assert choice2 in sample_guided_choice assert choice1 != choice2 @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_json_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str, sample_json_schema): + messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -528,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_json=sample_json_schema)) message = chat_completion.choices[0].message assert message.content is not None json1 = 
json.loads(message.content) @@ -546,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_json=sample_json_schema)) message = chat_completion.choices[0].message assert message.content is not None json2 = json.loads(message.content) @@ -557,9 +567,8 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_regex_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str, sample_regex): +async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex): + messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -573,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_regex=sample_regex)) ip1 = chat_completion.choices[0].message.content assert ip1 is not None assert re.fullmatch(sample_regex, ip1) is not None @@ -585,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_regex=sample_regex)) ip2 = chat_completion.choices[0].message.content assert ip2 is not None assert re.fullmatch(sample_regex, ip2) is not None @@ -615,10 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI): @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, - guided_decoding_backend: str, 
sample_guided_choice): + messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -634,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_choice=sample_guided_choice)) assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.content is not None @@ -647,10 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_named_tool_use(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema): +async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -682,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, "name": "dummy_function_name" } }, - extra_body=dict(guided_decoding_backend=guided_decoding_backend)) + ) message = chat_completion.choices[0].message assert len(message.content) == 0 json_string = message.tool_calls[0].function.arguments @@ -717,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, "name": "dummy_function_name" } }, - extra_body=dict(guided_decoding_backend=guided_decoding_backend), stream=True) output = [] @@ -740,48 +741,128 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI, - sample_json_schema): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {sample_json_schema}" - }] +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async 
def test_required_tool_use(client: openai.AsyncOpenAI, + is_v1_server: bool, model_name: str): + if is_v1_server: + pytest.skip( + "tool_choice='required' requires features unsupported on V1") - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice="required") + tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to find the weather for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 'Austria'", + }, + "unit": { + "type": "string", + "description": + "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["country", "unit"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_forecast", + "description": "Get the weather forecast for a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to get the forecast for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 
'Austria'", + }, + "days": { + "type": + "integer", + "description": + "Number of days to get the forecast for (1-7)", + }, + "unit": { + "type": "string", + "description": + "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["country", "days", "unit"], + }, + }, + }, + ] - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice="auto") + messages = [ + { + "role": "user", + "content": "Hi! How are you doing today?" + }, + { + "role": "assistant", + "content": "I'm doing well! How can I help you?" + }, + { + "role": + "user", + "content": + "Can you tell me what the current weather is in Berlin and the "\ + "forecast for the next 5 days, in fahrenheit?", + }, + ] + + # Non-streaming test + chat_completion = await client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice="required", + ) + + assert chat_completion.choices[0].message.tool_calls is not None + assert len(chat_completion.choices[0].message.tool_calls) > 0 + + # Streaming test + stream = await client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice="required", + stream=True, + ) + + output = [] + async for chunk in stream: + if chunk.choices and chunk.choices[0].delta.tool_calls: + output.extend(chunk.choices[0].delta.tool_calls) + + assert len(output) > 0 @pytest.mark.asyncio @@ -1000,7 +1081,7 @@ async def test_long_seed(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_http_chat_wo_model_name(server: RemoteOpenAIServer): +async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer): url = f"http://localhost:{server.port}/v1/chat/completions" headers = { 
"Content-Type": "application/json", @@ -1021,10 +1102,35 @@ async def test_http_chat_wo_model_name(server: RemoteOpenAIServer): response = requests.post(url, headers=headers, json=data) response_data = response.json() print(response_data) - + assert response_data.get("model") == MODEL_NAME choice = response_data.get("choices")[0] message = choice.get("message") assert message is not None content = message.get("content") assert content is not None assert len(content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME, ""]) +async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer, + model_name: str): + + openai_api_key = "EMPTY" + openai_api_base = f"http://localhost:{server.port}/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + messages = [ + { + "role": "user", + "content": "Hello, vLLM!" + }, + ] + response = client.chat.completions.create( + model="", # empty string + messages=messages, + ) + assert response.model == MODEL_NAME diff --git a/tests/entrypoints/openai/test_chat_logit_bias_validation.py b/tests/entrypoints/openai/test_chat_logit_bias_validation.py new file mode 100644 index 0000000000000..9dab524ea4801 --- /dev/null +++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 + +import openai +import pytest +import pytest_asyncio + +from vllm.config import ModelConfig + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" + + +def get_vocab_size(model_name): + config = ModelConfig( + model=model_name, + task="auto", + tokenizer=model_name, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="bfloat16", + ) + return config.get_vocab_size() + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "1024", + "--enforce-eager", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield 
remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_chat_logit_bias_valid(client): + """Test that valid logit_bias values are accepted in chat completions.""" + vocab_size = get_vocab_size(MODEL_NAME) + valid_token_id = vocab_size - 1 + + completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "Testing valid logit bias" + }], + max_tokens=5, + logit_bias={str(valid_token_id): 1.0}, + ) + + assert completion.choices[0].message.content is not None + + +@pytest.mark.asyncio +async def test_chat_logit_bias_invalid(client): + """Test that invalid logit_bias values are rejected in chat completions.""" + vocab_size = get_vocab_size(MODEL_NAME) + invalid_token_id = vocab_size + 1 + + with pytest.raises(openai.BadRequestError) as excinfo: + await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "Testing invalid logit bias" + }], + max_tokens=5, + logit_bias={str(invalid_token_id): 1.0}, + ) + + error = excinfo.value + error_message = str(error) + + assert error.status_code == 400 + assert str(invalid_token_id) in error_message + assert str(vocab_size) in error_message diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 0d1c936da7597..2cdeb684f75de 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,6 +11,7 @@ import requests from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer +from ...models.embedding.utils import check_embeddings_close from ...utils import RemoteOpenAIServer MODEL_NAME = "intfloat/multilingual-e5-small" @@ -190,30 +191,35 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI, responses_float = await 
client.embeddings.create(input=input_texts, model=model_name, encoding_format="float") + float_data = [d.embedding for d in responses_float.data] responses_base64 = await client.embeddings.create(input=input_texts, model=model_name, encoding_format="base64") - - decoded_responses_base64_data = [] + base64_data = [] for data in responses_base64.data: - decoded_responses_base64_data.append( + base64_data.append( np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()) - assert responses_float.data[0].embedding == decoded_responses_base64_data[ - 0] - assert responses_float.data[1].embedding == decoded_responses_base64_data[ - 1] + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=base64_data, + name_0="float", + name_1="base64", + ) # Default response is float32 decoded from base64 by OpenAI Client responses_default = await client.embeddings.create(input=input_texts, model=model_name) + default_data = [d.embedding for d in responses_default.data] - assert responses_float.data[0].embedding == responses_default.data[ - 0].embedding - assert responses_float.data[1].embedding == responses_default.data[ - 1].embedding + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=default_data, + name_0="float", + name_1="default", + ) @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py new file mode 100644 index 0000000000000..43d109f74f5da --- /dev/null +++ b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`. 
+""" + +import openai +import pytest + +from vllm.entrypoints.openai.protocol import EmbeddingResponse + +from ...models.embedding.utils import EmbedModelInfo +from ...utils import RemoteOpenAIServer + +MODELS = [ + EmbedModelInfo(name="BAAI/bge-m3", is_matryoshka=False), + EmbedModelInfo(name="jinaai/jina-embeddings-v3", is_matryoshka=True), +] + +input_texts = [ + "The chef prepared a delicious meal.", +] * 3 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model", MODELS) +async def test_validating_dimensions(model: EmbedModelInfo): + args = [ + "--task", + "embed", + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + "512", + "--trust_remote_code" + ] + with RemoteOpenAIServer(model.name, args) as remote_server: + client = remote_server.get_async_client() + + async def make_request(dimensions): + embedding_response = await client.embeddings.create( + model=model.name, + input=input_texts, + dimensions=dimensions, + encoding_format="float", + ) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) > 0 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens > 0 + assert embeddings.usage.total_tokens > 0 + + if dimensions is not None: + assert len(embeddings.data[0].embedding) == dimensions + + if model.is_matryoshka: + for dimensions in [None, 16]: + await make_request(dimensions) + + with pytest.raises(openai.BadRequestError): + for dimensions in [-1]: + await make_request(dimensions) + + else: + for dimensions in [None]: + await make_request(dimensions) + + with pytest.raises(openai.BadRequestError): + for dimensions in [-1, 16]: + await make_request(dimensions) diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 
1a62157acc478..2fc08b47513e6 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -53,7 +53,20 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def server_with_lora_modules_json(zephyr_lora_files): +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module", params=[False, True]) +def server_with_lora_modules_json(request, monkeypatch_module, + zephyr_lora_files): + + use_v1 = request.param + monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + # Define the json format LoRA module configurations lora_module_1 = { "name": "zephyr-lora", diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py new file mode 100644 index 0000000000000..c96151349eb3f --- /dev/null +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: Apache-2.0 + +from contextlib import suppress +from dataclasses import dataclass, field +from http import HTTPStatus +from typing import Optional +from unittest.mock import MagicMock + +import pytest + +from vllm.config import MultiModalConfig +from vllm.engine.multiprocessing.client import MQLLMEngineClient +from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse +from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) +from vllm.lora.request import LoRARequest +from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry +from vllm.transformers_utils.tokenizer import get_tokenizer + +MODEL_NAME = "openai-community/gpt2" +BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] + +MOCK_RESOLVER_NAME = "mock_test_resolver" + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class 
MockModelConfig: + """Minimal mock ModelConfig for testing.""" + model: str = MODEL_NAME + tokenizer: str = MODEL_NAME + trust_remote_code: bool = False + tokenizer_mode: str = "auto" + max_model_len: int = 100 + tokenizer_revision: Optional[str] = None + multimodal_config: MultiModalConfig = field( + default_factory=MultiModalConfig) + hf_config: MockHFConfig = field(default_factory=MockHFConfig) + logits_processor_pattern: Optional[str] = None + diff_sampling_param: Optional[dict] = None + allowed_local_media_path: str = "" + encoder_config = None + generation_config: str = "auto" + + def get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + +class MockLoRAResolver(LoRAResolver): + + async def resolve_lora(self, base_model_name: str, + lora_name: str) -> Optional[LoRARequest]: + if lora_name == "test-lora": + return LoRARequest(lora_name="test-lora", + lora_int_id=1, + lora_local_path="/fake/path/test-lora") + elif lora_name == "invalid-lora": + return LoRARequest(lora_name="invalid-lora", + lora_int_id=2, + lora_local_path="/fake/path/invalid-lora") + return None + + +@pytest.fixture(autouse=True) +def register_mock_resolver(): + """Fixture to register and unregister the mock LoRA resolver.""" + resolver = MockLoRAResolver() + LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, resolver) + yield + # Cleanup: remove the resolver after the test runs + if MOCK_RESOLVER_NAME in LoRAResolverRegistry.resolvers: + del LoRAResolverRegistry.resolvers[MOCK_RESOLVER_NAME] + + +@pytest.fixture +def mock_serving_setup(): + """Provides a mocked engine and serving completion instance.""" + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + def mock_add_lora_side_effect(lora_request: LoRARequest): + """Simulate engine behavior when adding LoRAs.""" + if lora_request.lora_name == "test-lora": + # Simulate successful addition + return + elif 
lora_request.lora_name == "invalid-lora": + # Simulate failure during addition (e.g. invalid format) + raise ValueError(f"Simulated failure adding LoRA: " + f"{lora_request.lora_name}") + + mock_engine.add_lora.side_effect = mock_add_lora_side_effect + mock_engine.generate.reset_mock() + mock_engine.add_lora.reset_mock() + + mock_model_config = MockModelConfig() + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + + serving_completion = OpenAIServingCompletion(mock_engine, + mock_model_config, + models, + request_logger=None) + + return mock_engine, serving_completion + + +@pytest.mark.asyncio +async def test_serving_completion_with_lora_resolver(mock_serving_setup, + monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + + mock_engine, serving_completion = mock_serving_setup + + lora_model_name = "test-lora" + req_found = CompletionRequest( + model=lora_model_name, + prompt="Generate with LoRA", + ) + + # Suppress potential errors during the mocked generate call, + # as we are primarily checking for add_lora and generate calls + with suppress(Exception): + await serving_completion.create_completion(req_found) + + mock_engine.add_lora.assert_called_once() + called_lora_request = mock_engine.add_lora.call_args[0][0] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == lora_model_name + + mock_engine.generate.assert_called_once() + called_lora_request = mock_engine.generate.call_args[1]['lora_request'] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == lora_model_name + + +@pytest.mark.asyncio +async def test_serving_completion_resolver_not_found(mock_serving_setup, + monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + + mock_engine, serving_completion = mock_serving_setup + + non_existent_model = "non-existent-lora-adapter" + req = 
CompletionRequest( + model=non_existent_model, + prompt="what is 1+1?", + ) + + response = await serving_completion.create_completion(req) + + mock_engine.add_lora.assert_not_called() + mock_engine.generate.assert_not_called() + + assert isinstance(response, ErrorResponse) + assert response.code == HTTPStatus.NOT_FOUND.value + assert non_existent_model in response.message + + +@pytest.mark.asyncio +async def test_serving_completion_resolver_add_lora_fails( + mock_serving_setup, monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + + mock_engine, serving_completion = mock_serving_setup + + invalid_model = "invalid-lora" + req = CompletionRequest( + model=invalid_model, + prompt="what is 1+1?", + ) + + response = await serving_completion.create_completion(req) + + # Assert add_lora was called before the failure + mock_engine.add_lora.assert_called_once() + called_lora_request = mock_engine.add_lora.call_args[0][0] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == invalid_model + + # Assert generate was *not* called due to the failure + mock_engine.generate.assert_not_called() + + # Assert the correct error response + assert isinstance(response, ErrorResponse) + assert response.code == HTTPStatus.BAD_REQUEST.value + assert invalid_model in response.message + + +@pytest.mark.asyncio +async def test_serving_completion_flag_not_set(mock_serving_setup): + mock_engine, serving_completion = mock_serving_setup + + lora_model_name = "test-lora" + req_found = CompletionRequest( + model=lora_model_name, + prompt="Generate with LoRA", + ) + + await serving_completion.create_completion(req_found) + + mock_engine.add_lora.assert_not_called() + mock_engine.generate.assert_not_called() diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 2bffd0ce138e6..42f7b098f917d 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ 
b/tests/entrypoints/openai/test_metrics.py @@ -13,9 +13,12 @@ import requests from prometheus_client.parser import text_string_to_metric_families from transformers import AutoTokenizer +from vllm import version + from ...utils import RemoteOpenAIServer MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +PREV_MINOR_VERSION = version._prev_minor_version() @pytest.fixture(scope="module", params=[True, False]) @@ -55,6 +58,7 @@ def default_server_args(): "", "--enable-chunked-prefill", "--disable-frontend-multiprocessing", + f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}", ]) def server(use_v1, default_server_args, request): if request.param: @@ -129,7 +133,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer, # Loop over all expected metric_families for metric_family, suffix_values_list in EXPECTED_VALUES.items(): - if use_v1 and metric_family not in EXPECTED_METRICS_V1: + if ((use_v1 and metric_family not in EXPECTED_METRICS_V1) + or (not server.show_hidden_metrics + and metric_family in HIDDEN_DEPRECATED_METRICS)): continue found_metric = False @@ -165,10 +171,10 @@ async def test_metrics_counts(server: RemoteOpenAIServer, EXPECTED_METRICS = [ "vllm:num_requests_running", - "vllm:num_requests_swapped", + "vllm:num_requests_swapped", # deprecated "vllm:num_requests_waiting", "vllm:gpu_cache_usage_perc", - "vllm:cpu_cache_usage_perc", + "vllm:cpu_cache_usage_perc", # deprecated "vllm:time_to_first_token_seconds_sum", "vllm:time_to_first_token_seconds_bucket", "vllm:time_to_first_token_seconds_count", @@ -268,6 +274,11 @@ EXPECTED_METRICS_V1 = [ "vllm:request_decode_time_seconds_count", ] +HIDDEN_DEPRECATED_METRICS = [ + "vllm:num_requests_swapped", + "vllm:cpu_cache_usage_perc", +] + @pytest.mark.asyncio async def test_metrics_exist(server: RemoteOpenAIServer, @@ -282,7 +293,9 @@ async def test_metrics_exist(server: RemoteOpenAIServer, assert response.status_code == HTTPStatus.OK for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): - 
assert metric in response.text + if (not server.show_hidden_metrics + and metric not in HIDDEN_DEPRECATED_METRICS): + assert metric in response.text def test_metrics_exist_run_batch(use_v1: bool): diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py new file mode 100644 index 0000000000000..1ccb803a328d6 --- /dev/null +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import schemathesis +from schemathesis import GenerationConfig + +from ...utils import RemoteOpenAIServer + +schemathesis.experimental.OPEN_API_3_1.enable() + +MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct" +MAXIMUM_IMAGES = 2 + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "generate", + "--max-model-len", + "2048", + "--max-num-seqs", + "5", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"image={MAXIMUM_IMAGES}", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def get_schema(server): + # avoid generating null (\x00) bytes in strings during test case generation + return schemathesis.openapi.from_uri( + f"{server.url_root}/openapi.json", + generation_config=GenerationConfig(allow_x00=False), + ) + + +schema = schemathesis.from_pytest_fixture("get_schema") + + +@schema.parametrize() +@schema.override(headers={"Content-Type": "application/json"}) +async def test_openapi_stateless(case): + #No need to verify SSL certificate for localhost + await case.call_and_validate(verify=False) diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 64a1eb6a63eef..f889189a99681 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -17,7 +17,7 @@ async def test_empty_prompt(): client = remote_server.get_async_client() 
with pytest.raises(openai.BadRequestError, - match=re.compile('.+Prompt cannot be empty.+')): + match="decoder prompt cannot be empty"): await client.completions.create(model=model_name, prompt="", max_tokens=5, diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py index 8bdf00bcee126..3ca8a9a410ffd 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/openai/test_sleep.py @@ -25,15 +25,37 @@ def test_sleep_mode(): "VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0" }) as remote_server: - response = requests.post(remote_server.url_for("/sleep"), - data={"level": "1"}) + response = requests.post(remote_server.url_for("sleep"), + params={"level": "1"}) assert response.status_code == 200 - response = requests.get(remote_server.url_for("/is_sleeping")) + response = requests.get(remote_server.url_for("is_sleeping")) assert response.status_code == 200 assert response.json().get("is_sleeping") is True - response = requests.post(remote_server.url_for("/wake_up")) + response = requests.post(remote_server.url_for("wake_up")) assert response.status_code == 200 - response = requests.get(remote_server.url_for("/is_sleeping")) + response = requests.get(remote_server.url_for("is_sleeping")) + assert response.status_code == 200 + assert response.json().get("is_sleeping") is False + + # test wake up with tags + response = requests.post(remote_server.url_for("sleep"), + params={"level": "1"}) + assert response.status_code == 200 + + response = requests.post(remote_server.url_for("wake_up"), + params={"tags": ["weights"]}) + assert response.status_code == 200 + + # is sleeping should be false after waking up any part of the engine + response = requests.get(remote_server.url_for("is_sleeping")) + assert response.status_code == 200 + assert response.json().get("is_sleeping") is True + + response = requests.post(remote_server.url_for("wake_up"), + params={"tags": ["kv_cache"]}) + assert response.status_code == 200 + + 
response = requests.get(remote_server.url_for("is_sleeping")) assert response.status_code == 200 assert response.json().get("is_sleeping") is False diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 29571bcd7649b..5c48df3cebbc2 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -192,3 +192,36 @@ async def test_stream_options(winning_call): else: continuous = continuous and hasattr(chunk, 'usage') assert final and continuous + + +@pytest.mark.asyncio +async def test_sampling_params(mary_had_lamb): + """ + Compare sampling with params and greedy sampling to assert results + are different when extreme sampling parameters values are picked. + """ + model_name = "openai/whisper-small" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + transcription = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + language="en", + temperature=0.8, + extra_body=dict(seed=42, + repetition_penalty=1.9, + top_k=12, + top_p=0.4, + min_p=0.5, + frequency_penalty=1.8, + presence_penalty=2.0)) + + greedy_transcription = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + language="en", + temperature=0.0, + extra_body=dict(seed=42)) + + assert greedy_transcription.text != transcription.text diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index f9ccce9c1c332..53f057a294c0a 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import openai import pytest import pytest_asyncio @@ -31,7 +33,7 @@ def server(): "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"video={MAXIMUM_VIDEOS}", + 
json.dumps({"video": MAXIMUM_VIDEOS}), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -106,6 +108,35 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_error_on_invalid_video_url_type(client: openai.AsyncOpenAI, + model_name: str, + video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": video_url + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + # video_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index bb100e573b878..1ab50b41c7ecb 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -1,8 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import openai import pytest import pytest_asyncio +import requests +from PIL import Image +from transformers import AutoProcessor from vllm.multimodal.utils import encode_image_base64, fetch_image @@ -32,7 +37,7 @@ def server(): "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"image={MAXIMUM_IMAGES}", + json.dumps({"image": MAXIMUM_IMAGES}), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -53,11 +58,31 @@ def base64_encoded_image() -> dict[str, str]: } +def get_hf_prompt_tokens(model_name, content, image_url): + processor = AutoProcessor.from_pretrained(model_name, + 
trust_remote_code=True, + num_crops=4) + + placeholder = "<|image_1|>\n" + messages = [{ + "role": "user", + "content": f"{placeholder}{content}", + }] + images = [Image.open(requests.get(image_url, stream=True).raw)] + + prompt = processor.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) + inputs = processor(prompt, images, return_tensors="pt") + + return inputs.input_ids.shape[1] + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_single_chat_session_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): + content_text = "What's in this image?" messages = [{ "role": "user", @@ -70,16 +95,17 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, }, { "type": "text", - "text": "What's in this image?" + "text": content_text }, ], }] + max_completion_tokens = 10 # test single completion chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_completion_tokens=10, + max_completion_tokens=max_completion_tokens, logprobs=True, temperature=0.0, top_logprobs=5) @@ -87,8 +113,12 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, choice = chat_completion.choices[0] assert choice.finish_reason == "length" + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, + image_url) assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=774, total_tokens=784) + completion_tokens=max_completion_tokens, + prompt_tokens=hf_prompt_tokens, + total_tokens=hf_prompt_tokens + max_completion_tokens) message = choice.message message = chat_completion.choices[0].message @@ -107,6 +137,36 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) 
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, + model_name: str, + image_url: str): + content_text = "What's in this image?" + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": image_url + }, + { + "type": "text", + "text": content_text + }, + ], + }] + + # image_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @@ -150,6 +210,7 @@ async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, image_url: str, base64_encoded_image: dict[str, str]): + content_text = "What's in this image?" messages = [{ "role": "user", @@ -163,16 +224,17 @@ async def test_single_chat_session_image_base64encoded( }, { "type": "text", - "text": "What's in this image?" 
+ "text": content_text }, ], }] + max_completion_tokens = 10 # test single completion chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_completion_tokens=10, + max_completion_tokens=max_completion_tokens, logprobs=True, temperature=0.0, top_logprobs=5) @@ -180,8 +242,12 @@ async def test_single_chat_session_image_base64encoded( choice = chat_completion.choices[0] assert choice.finish_reason == "length" + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, + image_url) assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=774, total_tokens=784) + completion_tokens=max_completion_tokens, + prompt_tokens=hf_prompt_tokens, + total_tokens=hf_prompt_tokens + max_completion_tokens) message = choice.message message = chat_completion.choices[0].message diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 74e5c4cc7ea4a..26c68e06c199f 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -1,7 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import pytest import requests +from PIL import Image +from transformers import AutoProcessor from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.multimodal.utils import encode_image_base64, fetch_image @@ -35,7 +39,7 @@ def server(): "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"image={MAXIMUM_IMAGES}", + json.dumps({"image": MAXIMUM_IMAGES}), "--chat-template", str(vlm2vec_jinja_path), ] @@ -52,11 +56,24 @@ def base64_encoded_image() -> dict[str, str]: } +def get_hf_prompt_tokens(model_name, content, image_url): + processor = AutoProcessor.from_pretrained(model_name, + trust_remote_code=True, + num_crops=4) + + placeholder = "<|image_1|> " + prompt = f"{placeholder}{content}" + images = [Image.open(requests.get(image_url, stream=True).raw)] 
+ inputs = processor(prompt, images, return_tensors="pt") + return inputs.input_ids.shape[1] + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, image_url: str): + content_text = "Represent the given image." messages = [{ "role": "user", @@ -69,7 +86,7 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, }, { "type": "text", - "text": "Represent the given image." + "text": content_text }, ], }] @@ -85,9 +102,12 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, response.raise_for_status() embeddings = EmbeddingResponse.model_validate(response.json()) + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, + image_url) + assert embeddings.id is not None assert len(embeddings.data) == 1 assert len(embeddings.data[0].embedding) == 3072 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 763 - assert embeddings.usage.total_tokens == 763 + assert embeddings.usage.prompt_tokens == hf_prompt_tokens + assert embeddings.usage.total_tokens == hf_prompt_tokens diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 6efed990b1893..92c1e0fec6b74 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -9,11 +9,11 @@ from transformers import __version__ as TRANSFORMERS_VERSION from vllm.assets.image import ImageAsset from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import (_resolve_hf_chat_template, - _try_extract_ast, load_chat_template, +from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, parse_chat_messages, parse_chat_messages_futures, - resolve_chat_template_content_format) + resolve_chat_template_content_format, + resolve_hf_chat_template) from vllm.entrypoints.llm import 
apply_hf_chat_template from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 @@ -25,11 +25,13 @@ EXAMPLES_DIR = VLLM_PATH / "examples" PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_5-llama-3_2-1b" +QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct" QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct" MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B" HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B" +MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" @pytest.fixture(scope="function") @@ -80,6 +82,30 @@ def mllama_tokenizer(): ) +@pytest.fixture(scope="function") +def mistral_model_config(): + return ModelConfig(MISTRAL_MODEL_ID, + task="generate", + tokenizer=MISTRAL_MODEL_ID, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="auto", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }) + + +@pytest.fixture(scope="module") +def mistral_tokenizer(): + return TokenizerGroup( + tokenizer_id=MISTRAL_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + + @pytest.fixture(scope="module") def image_url(): image = ImageAsset('cherry_blossom') @@ -131,6 +157,66 @@ def test_parse_chat_messages_single_image( _assert_mm_data_is_image_input(mm_data, 1) +def test_parse_chat_messages_empty_system( + mistral_model_config, + mistral_tokenizer, +): + # Test string format + conversation, _ = parse_chat_messages( + [{ + "role": "system", + "content": "" + }, { + "role": "user", + "content": [{ + "type": "text", + "text": "Who are you?" + }] + }], + mistral_model_config, + mistral_tokenizer, + content_format="string", + ) + assert conversation == [{ + "role": "system", + "content": "" + }, { + "role": "user", + "content": "Who are you?" 
+ }] + + # Test openai format + conversation, _ = parse_chat_messages( + [{ + "role": "system", + "content": "" + }, { + "role": "user", + "content": [{ + "type": "text", + "text": "Who are you?" + }] + }], + mistral_model_config, + mistral_tokenizer, + content_format="openai", + ) + assert conversation == [{ + "role": "system", + "content": [{ + "type": "text", + "text": "" + }] + }, { + "role": + "user", + "content": [{ + "type": "text", + "text": "Who are you?" + }] + }] + + @pytest.mark.asyncio async def test_parse_chat_messages_single_image_async( phi3v_model_config, @@ -671,7 +757,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): # Build a config for the model model_config = ModelConfig(model, task="generate", - tokenizer=MLLAMA_MODEL_ID, + tokenizer=model, tokenizer_mode="auto", trust_remote_code=True, dtype="auto", @@ -682,7 +768,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): # Build the tokenizer group and grab the underlying tokenizer tokenizer_group = TokenizerGroup( - MLLAMA_MODEL_ID, + model, enable_lora=False, max_num_seqs=5, max_input_length=None, @@ -747,7 +833,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): }] if use_tools else None # Test detecting the tokenizer's chat_template - chat_template = _resolve_hf_chat_template( + chat_template = resolve_hf_chat_template( tokenizer, chat_template=None, tools=tools, @@ -756,6 +842,8 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): assert isinstance(chat_template, str) +# NOTE: Qwen2-Audio default chat template is specially defined inside +# processor class instead of using `tokenizer_config.json` # yapf: disable @pytest.mark.parametrize( ("model", "expected_format"), @@ -763,6 +851,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): (QWEN2VL_MODEL_ID, "openai"), (QWEN25VL_MODEL_ID, "openai"), (ULTRAVOX_MODEL_ID, "string"), + (QWEN2AUDIO_MODEL_ID, "openai"), (MLLAMA_MODEL_ID, 
"openai"), (LLAMA_GUARD_MODEL_ID, "openai")], ) @@ -781,7 +870,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): tokenizer = tokenizer_group.tokenizer # Test detecting the tokenizer's chat_template - chat_template = _resolve_hf_chat_template( + chat_template = resolve_hf_chat_template( tokenizer, chat_template=None, tools=None, @@ -815,10 +904,13 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("template_chatglm2.jinja", "string"), ("template_chatml.jinja", "string"), ("template_deepseek_vl2.jinja", "string"), + ("template_dse_qwen2_vl.jinja", "openai"), ("template_falcon_180b.jinja", "string"), ("template_falcon.jinja", "string"), + ("template_florence2.jinja", "string"), ("template_inkbot.jinja", "string"), ("template_llava.jinja", "string"), + ("template_teleflm.jinja", "string"), ("template_vlm2vec.jinja", "openai"), ("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"), diff --git a/tests/kernels/conftest.py b/tests/kernels/attention/conftest.py similarity index 100% rename from tests/kernels/conftest.py rename to tests/kernels/attention/conftest.py diff --git a/tests/kernels/test_attention.py b/tests/kernels/attention/test_attention.py similarity index 99% rename from tests/kernels/test_attention.py rename to tests/kernels/attention/test_attention.py index 0d7898a900e48..e5650136f2584 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -6,13 +6,12 @@ from typing import Optional import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes -from .allclose_default import get_default_atol, get_default_rtol - if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import 
BlockDiagonalCausalMask diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py new file mode 100644 index 0000000000000..b0414244c2151 --- /dev/null +++ b/tests/kernels/attention/test_attention_selector.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import patch + +import pytest +import torch + +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms.cpu import CpuPlatform +from vllm.platforms.cuda import CudaPlatform +from vllm.platforms.rocm import RocmPlatform +from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + +# Define MLA and non-MLA backends separately +DEVICE_MLA_BACKENDS = { + "cuda": ["TRITON_MLA", "FLASHMLA"], + "hip": ["TRITON_MLA", "ROCM_AITER_MLA"], + "cpu": [], +} + +DEVICE_REGULAR_ATTN_BACKENDS = { + "cuda": ["XFORMERS", "FLASHINFER"], + "hip": ["ROCM_FLASH"], + "cpu": ["TORCH_SDPA"], +} + +DEVICE_MLA_BLOCK_SIZES = { + "cuda": [16, 64], # CUDA supports both standard and extended block sizes + "hip": [16, 1], # HIP requires special handling for block_size=1 + "cpu": [16] # CPU uses fixed block size from test cases +} + + +def generate_params(): + params = [] + for use_mla in [True, False]: + for device in ["cuda", "hip", "cpu"]: + backends = DEVICE_MLA_BACKENDS[ + device] if use_mla else DEVICE_REGULAR_ATTN_BACKENDS[device] + for name in backends: + block_sizes = DEVICE_MLA_BLOCK_SIZES[device] if use_mla else [ + 16 + ] + for block_size in block_sizes: + params.append( + pytest.param( + device, + name, + use_mla, + block_size, + id= + f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}" + )) + return params + + +@pytest.mark.parametrize("device, name, use_mla, block_size", + generate_params()) 
+@pytest.mark.parametrize("use_v1", [True, False]) +def test_env( + device: str, + name: str, + use_mla: bool, + block_size: int, + use_v1: bool, + monkeypatch: pytest.MonkeyPatch, +): + """Test attention backend selection with valid device-backend pairs.""" + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, name) + m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") + + if device == "cpu": + with patch("vllm.attention.selector.current_platform", + CpuPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, + block_size, False) + assert backend.get_name() == "TORCH_SDPA" + + elif device == "hip": + with patch("vllm.attention.selector.current_platform", + RocmPlatform()): + if use_mla: + # Validate HIP MLA backend-block_size combinations + valid_combination = ( + (name == "TRITON_MLA" and block_size != 1) + or (name == "ROCM_AITER_MLA" and block_size == 1)) + + if valid_combination: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + assert backend.get_name() == name + else: + with pytest.raises(ValueError) as exc_info: + get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + assert f"The selected backend, {name}" in str( + exc_info.value) + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" + assert backend.get_name() == expected + + elif device == "cuda": + with patch("vllm.attention.selector.current_platform", + CudaPlatform()): + if use_mla: + if name == "FLASHMLA" and block_size == 64: + from vllm.attention.backends.flashmla import ( + is_flashmla_supported) + + # only on cuda platforms with specific capability. + is_supported, _ = is_flashmla_supported() + + if not is_supported: + # if platform is not supported then skip this case. 
+ pytest.skip() + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = f"{name}_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = ("TRITON_MLA_VLLM_V1" + if use_v1 else "TRITON_MLA") + assert backend.get_name() == expected + elif name == "FLASHINFER": + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "FLASHINFER_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected + + +def test_flash_attn(monkeypatch: pytest.MonkeyPatch): + """Test FlashAttn validation.""" + # TODO: When testing for v1, pipe in `use_v1` as an argument to + # get_attn_backend + + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) + + # Unsupported CUDA arch + monkeypatch.setattr(torch.cuda, "get_device_capability", lambda: + (7, 5)) + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Reset the monkeypatch for subsequent tests + monkeypatch.undo() + + # Unsupported data type + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Unsupported kv cache data type + backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Unsupported block size + backend = get_attn_backend(16, torch.float16, None, 8, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # flash-attn is not installed + import sys + original_module = sys.modules.get('vllm_flash_attn') + 
monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Restore the original module if it existed + if original_module is not None: + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', + original_module) + else: + monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) + + # Unsupported head size + backend = get_attn_backend(17, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Attention-free models should bypass env and use PlaceholderAttention + backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + +@pytest.mark.parametrize("use_v1", [True, False]) +def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m, patch( + "vllm.attention.selector.current_platform", CudaPlatform()): + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + + # Test with head size 32 + backend = get_attn_backend(32, torch.float16, None, 16, False) + EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" + assert backend.get_name() == EXPECTED + + # when block size == 16, backend will fall back to XFORMERS + # this behavior is not yet supported on V1. + if use_v1: + # TODO: support fallback on V1! 
+ # https://github.com/vllm-project/vllm/issues/14524 + pass + else: + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() == "XFORMERS" diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py similarity index 99% rename from tests/kernels/test_blocksparse_attention.py rename to tests/kernels/attention/test_blocksparse_attention.py index 3025ae0f921a4..82d038257575c 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/attention/test_blocksparse_attention.py @@ -6,14 +6,13 @@ from typing import Optional import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes -from .allclose_default import get_default_atol, get_default_rtol - FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
# - 512 as a buffer diff --git a/tests/kernels/test_cache.py b/tests/kernels/attention/test_cache.py similarity index 100% rename from tests/kernels/test_cache.py rename to tests/kernels/attention/test_cache.py diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py similarity index 100% rename from tests/kernels/test_cascade_flash_attn.py rename to tests/kernels/attention/test_cascade_flash_attn.py diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py similarity index 100% rename from tests/kernels/test_encoder_decoder_attn.py rename to tests/kernels/attention/test_encoder_decoder_attn.py diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py similarity index 100% rename from tests/kernels/test_flash_attn.py rename to tests/kernels/attention/test_flash_attn.py diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py similarity index 100% rename from tests/kernels/test_flashinfer.py rename to tests/kernels/attention/test_flashinfer.py diff --git a/tests/kernels/test_flashmla.py b/tests/kernels/attention/test_flashmla.py similarity index 98% rename from tests/kernels/test_flashmla.py rename to tests/kernels/attention/test_flashmla.py index 21c1079fc8eb3..3985c6834f60e 100644 --- a/tests/kernels/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -124,7 +124,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, cal_diff(out_flash, out_torch, "out") cal_diff(lse_flash, lse_torch, "lse") - t = triton.testing.do_bench(flash_mla, fast_flush=False) + t = triton.testing.do_bench(flash_mla) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py new 
file mode 100644 index 0000000000000..fbad52987dd2b --- /dev/null +++ b/tests/kernels/attention/test_lightning_attn.py @@ -0,0 +1,286 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch + +from vllm.model_executor.layers.lightning_attn import ( + linear_decode_forward_triton) +from vllm.platforms import current_platform + +NUM_HEADS = [4, 8] +HEAD_SIZES = [64] +BATCH_SIZES = [1, 2] +SEQ_LENGTHS = [16] +DTYPES = [torch.float32] + + +def reference_lightning_attention(q, k, v, ed, block_size, kv_history): + """Reference implementation of lightning attention core algorithm + + The difference from the main implementation is that this processes + each step sequentially, instead of using parallelized triton kernels + """ + B, H, S, D = q.shape + E = v.shape[-1] + dtype = q.dtype + output = torch.zeros((B, H, S, E), dtype=dtype, device=q.device) + + # Use clone() to ensure an independent copy + if kv_history is None: + kv_cache = torch.zeros((B, H, D, E), dtype=dtype, device=q.device) + else: + kv_cache = kv_history.clone() + + # More efficient implementation + # Convert decay factors to matrix form + if ed.dim() == 1: + decay = torch.exp(-ed).view(1, -1, 1, 1) + else: + decay = torch.exp(-ed) + + for b in range(B): + for step in range(S): + # Process all heads at once for this position + q_bs = q[b, :, step] # [H, D] + k_bs = k[b, :, step] # [H, D] + v_bs = v[b, :, step] # [H, E] + + # Calculate KV outer products for all heads + for h in range(H): + # Calculate KV outer product + kv_outer = torch.outer(k_bs[h], v_bs[h]) + + # Update KV cache with decay + # Note: Using the same order as in the Triton kernel + kv_cache[b, h] = decay[0, h, 0, 0] * kv_cache[b, h] + kv_outer + + # Calculate attention output + output[b, h, step] = torch.matmul(q_bs[h], kv_cache[b, h]) + + # Match the shape returned by the actual implementation + # The actual implementation returns a tensor of shape [B, H, 2, D, E] + # where dimension 2 contains both KV and KV history + 
kv_reshaped = kv_cache.unsqueeze(2) # [B, H, 1, D, E] + final_kv_cache = torch.cat([kv_reshaped, kv_reshaped], + dim=2) # [B, H, 2, D, E] + + return output, final_kv_cache + + +def reference_linear_decode(q, k, v, kv_caches, slope_rate, slot_idx): + """Reference implementation: linear attention decode function""" + B, H, _, D = q.shape + output = torch.zeros(B, H * D, dtype=q.dtype, device=q.device) + + # Calculate decay factors once (more efficient) + decay = torch.exp(-slope_rate).view(-1, 1, 1) # [H, 1, 1] + + # Process each batch + for b in range(B): + slot_id = slot_idx[b].item() + + # Skip padding positions + if slot_id == -1: + continue + + # Process all heads at once for this batch + q_b = q[b, :, 0] # [H, D] + k_b = k[b, :, 0] # [H, D] + v_b = v[b, :, 0] # [H, D] + + # Process each attention head + for h in range(H): + # Get current query, key and value + q_bh = q_b[h] + k_bh = k_b[h] + v_bh = v_b[h] + + # Get cache + kv_cache_old = kv_caches[b, h] + + # Calculate new key-value outer product + kv_outer = torch.outer(k_bh, v_bh) + + # Apply decay and update cache + kv_new = kv_outer + decay[h, 0, 0] * kv_cache_old + + # Calculate output + out_h = torch.matmul(q_bh, kv_new) + + # Update output and cache + output[b, h * D:(h + 1) * D] = out_h + kv_caches[b, h] = kv_new + + return output + + +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode() +def test_linear_decode_forward_triton( + batch_size: int, + num_heads: int, + head_size: int, + dtype: torch.dtype, +): + torch.set_default_device("cuda") + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + current_platform.seed_everything(42) + base = 0.01 + q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) + k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) + v = base * torch.randn(batch_size, 
num_heads, 1, head_size, dtype=dtype) + + kv_caches = base * torch.randn(batch_size, + num_heads, + head_size, + head_size, + dtype=dtype, + device="cuda") + + kv_caches_copy = kv_caches.clone() + + slope_rate = torch.zeros(num_heads, device="cuda") + for h in range(num_heads): + slope_rate[h] = 0.1 * (h + 1) + + slot_idx = torch.arange(batch_size, device="cuda") + + triton_output = linear_decode_forward_triton(q, k, v, kv_caches, + slope_rate, slot_idx) + + reference_output = reference_linear_decode(q, k, v, kv_caches_copy, + slope_rate, slot_idx) + torch.testing.assert_close(triton_output, + reference_output, + rtol=1e-1, + atol=1e-1) + torch.testing.assert_close(kv_caches, kv_caches_copy, rtol=1e-1, atol=1e-1) + + assert triton_output.shape == (batch_size, num_heads * head_size) + + +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode() +def test_linear_decode_forward_triton_with_padding( + num_heads: int, + head_size: int, + dtype: torch.dtype, +): + torch.set_default_device("cuda") + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + current_platform.seed_everything(42) + + batch_size = 4 + base = 0.01 + q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) + k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) + v = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) + + kv_caches = base * torch.randn(batch_size, + num_heads, + head_size, + head_size, + dtype=dtype, + device="cuda") + + kv_caches_copy = kv_caches.clone() + + slope_rate = torch.zeros(num_heads, device="cuda") + for h in range(num_heads): + slope_rate[h] = 0.1 * (h + 1) + + slot_idx = torch.tensor([0, 1, -1, 2], device="cuda") + + triton_output = linear_decode_forward_triton(q, k, v, kv_caches, + slope_rate, slot_idx) + + reference_output = reference_linear_decode(q, k, v, kv_caches_copy, + slope_rate, slot_idx) + + 
padding_mask = (slot_idx + != -1).unsqueeze(1).expand(-1, num_heads * head_size) + + triton_masked = triton_output[padding_mask] + reference_masked = reference_output[padding_mask] + + atol, rtol = 1.5e-1, 1.5e-1 + + valid_indices = slot_idx != -1 + + for i in range(batch_size): + if valid_indices[i] > 0: + torch.testing.assert_close(kv_caches[i], + kv_caches_copy[i], + rtol=rtol, + atol=atol) + + torch.testing.assert_close(triton_masked, + reference_masked, + rtol=rtol, + atol=atol) + + assert triton_output.shape == (batch_size, num_heads * head_size) + + +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENGTHS) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode() +def test_lightning_attention_reference( + batch_size: int, + num_heads: int, + head_size: int, + seq_len: int, + dtype: torch.dtype, +): + torch.set_default_device("cuda") + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + current_platform.seed_everything(42) + + base = 0.01 + q = base * torch.randn( + batch_size, num_heads, seq_len, head_size, dtype=dtype) + k = base * torch.randn( + batch_size, num_heads, seq_len, head_size, dtype=dtype) + v = base * torch.randn( + batch_size, num_heads, seq_len, head_size, dtype=dtype) + + ed = torch.zeros(num_heads, device="cuda") + for h in range(num_heads): + ed[h] = 0.1 * (h + 1) + + kv_history = base * torch.randn(batch_size, + num_heads, + head_size, + head_size, + dtype=dtype, + device="cuda") + + kv_history_clone = kv_history.clone() + + ref_output, ref_kv_cache = reference_lightning_attention( + q, k, v, ed, 256, kv_history) + + from vllm.model_executor.layers.lightning_attn import lightning_attention + actual_output, actual_kv_cache = lightning_attention( + q, k, v, ed, 256, kv_history_clone) + + atol, rtol = 1.5e-1, 1.5e-1 + torch.testing.assert_close(ref_output, actual_output, rtol=rtol, 
atol=atol) + torch.testing.assert_close(ref_kv_cache, + actual_kv_cache, + rtol=rtol, + atol=atol) + + assert ref_output.shape == (batch_size, num_heads, seq_len, head_size) + assert ref_kv_cache.shape == actual_kv_cache.shape diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py new file mode 100644 index 0000000000000..7038fbea5c22e --- /dev/null +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -0,0 +1,265 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import pytest +import torch + +from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda +from vllm.attention.ops.triton_merge_attn_states import ( + merge_attn_states as merge_attn_states_triton) +from vllm.platforms import current_platform + + +# Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 +# can be used to combine partial attention results (in the split-KV case) +def merge_attn_states_torch( + output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] + suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] + output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS] +): + p_lse = prefix_lse + s_lse = suffix_lse + # inf -> -inf + p_lse[p_lse == torch.inf] = -torch.inf + s_lse[s_lse == torch.inf] = -torch.inf + # max_lse [NUM_HEADS, NUM_TOKENS] + max_lse = torch.maximum(p_lse, s_lse) + p_lse = p_lse - max_lse + s_lse = s_lse - max_lse + p_lse_exp = torch.exp(p_lse) + s_lse_exp = torch.exp(s_lse) + out_se = (p_lse_exp + s_lse_exp) + if output_lse is not None: + output_lse = torch.log(out_se) + max_lse + p_scale = p_lse_exp / out_se # [NUM_HEADS, NUM_TOKENS] + s_scale = s_lse_exp / out_se # [NUM_HEADS, NUM_TOKENS] + p_scale = torch.transpose(p_scale, 0, + 1).unsqueeze(2) # 
[NUM_TOKENS, NUM_HEADS, 1] + s_scale = torch.transpose(s_scale, 0, + 1).unsqueeze(2) # [NUM_TOKENS, NUM_HEADS, 1] + output = prefix_output * p_scale + suffix_output * s_scale + return output, output_lse + + +NUM_BATCH_TOKENS = [256, 512, 613, 1024, 1536, 4096] +NUM_QUERY_HEADS = [4, 8, 16, 32, 48, 64] +HEAD_SIZES = [32, 48, 64, 96, 128, 256] +DTYPES = [torch.float32, torch.half, torch.bfloat16] + +all_case_info: list[tuple] = [] + + +def generate_markdown_table(): + global all_case_info + table_header = ("| tokens | heads | headsize | dtype " + "| device | torch | triton | cuda | speedup |") + table_separator = "| --- | --- | --- | --- | --- | --- | --- | --- | --- |" + + def shortly_dtype(dtype: torch.dtype) -> str: + return str(dtype).removeprefix("torch.") + + def shortly_device(device: str) -> str: + return device.removeprefix("NVIDIA").strip() + + print(table_header) + print(table_separator) + for info in all_case_info: + (num_tokens, num_heads, head_size, dtype, device, + avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel, + performance_improved) = info + dtype = shortly_dtype(dtype) + device = shortly_device(device) + print(f"| {num_tokens} | {num_heads} | {head_size} " + f"| {dtype} | {device} | {avg_time_torch_kernel:.5f}ms " + f"| {avg_time_triton_kernel:.5f}ms " + f"| {avg_time_cuda_kernel:.5f}ms " + f"| {performance_improved:.4f}x |") + + +@pytest.mark.parametrize("num_tokens", NUM_BATCH_TOKENS) +@pytest.mark.parametrize("num_query_heads", NUM_QUERY_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("output_dtype", DTYPES) +@torch.inference_mode() +def test_merge_attn_states(num_tokens: int, num_query_heads: int, + head_size: int, output_dtype: torch.dtype): + if not current_platform.is_cuda(): + pytest.skip('Currently only support compare triton merge_attn_states ' + 'with custom cuda merge_attn_states kernel') + + NUM_TOKENS = num_tokens + NUM_HEADS = num_query_heads + HEAD_SIZE = head_size + + 
print(f"\nNUM_TOKENS:{NUM_TOKENS}, NUM_HEADS:{NUM_HEADS}, " + f"HEAD_SIZE:{HEAD_SIZE}, DTYPE: {output_dtype}, " + f"Device: {current_platform.get_device_name()}") + + # prefix_lse and suffix_lse contain inf and normal values + prefix_lse = torch.randn(NUM_HEADS, + NUM_TOKENS, + dtype=torch.float32, + device="cuda") + suffix_lse = torch.randn(NUM_HEADS, + NUM_TOKENS, + dtype=torch.float32, + device="cuda") + + # Generate boolean masks + mask_prefix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1 + mask_suffix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1 + # Ensure that the same position is not True at the same time + combined_mask = torch.logical_and(mask_prefix, mask_suffix) + mask_prefix = torch.logical_and(mask_prefix, ~combined_mask) + mask_suffix = torch.logical_and(mask_suffix, ~combined_mask) + + prefix_lse[mask_prefix] = float('inf') + suffix_lse[mask_suffix] = float('inf') + + # Other input tensors (need to be initialized but + # no actual calculation needed) + output = torch.zeros((NUM_TOKENS, NUM_HEADS, HEAD_SIZE), + dtype=output_dtype, + device="cuda") + output_lse = torch.zeros((NUM_HEADS, NUM_TOKENS), + dtype=torch.float32, + device="cuda") + prefix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE), + dtype=output_dtype, + device="cuda") + suffix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE), + dtype=output_dtype, + device="cuda") + + warmup_times = 2 + repeat_times = 20 + + output_torch = output.clone() + output_lse_torch = output_lse.clone() + total_time_torch_kernel = 0 + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + # 0. 
Run the Torch kernel + prefix_lse_torch = prefix_lse.clone() + suffix_lse_torch = suffix_lse.clone() + for _ in range(warmup_times): + output_torch, output_lse_torch = merge_attn_states_torch( + output_torch, prefix_output, prefix_lse_torch, suffix_output, + suffix_lse_torch, output_lse_torch) + torch.cuda.synchronize() + + for _ in range(repeat_times): + start.record() + output_torch, output_lse_torch = merge_attn_states_torch( + output_torch, prefix_output, prefix_lse_torch, suffix_output, + suffix_lse_torch, output_lse_torch) + end.record() + torch.cuda.synchronize() + total_time_torch_kernel += start.elapsed_time(end) + + avg_time_torch_kernel = total_time_torch_kernel / repeat_times + + # 1. Run the Triton kernel + output_ref_triton = output.clone() + output_lse_ref_triton = output_lse.clone() + + total_time_triton_kernel = 0 + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + for _ in range(warmup_times): + merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse, + suffix_output, suffix_lse, + output_lse_ref_triton) + torch.cuda.synchronize() + + for _ in range(repeat_times): + start.record() + merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse, + suffix_output, suffix_lse, + output_lse_ref_triton) + end.record() + torch.cuda.synchronize() + total_time_triton_kernel += start.elapsed_time(end) + + avg_time_triton_kernel = total_time_triton_kernel / repeat_times + + # 2. 
Run the CUDA kernel + total_time_cuda_kernel = 0 + output_cuda = output.clone() + output_lse_cuda = output_lse.clone() + + for _ in range(warmup_times): + merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse, + suffix_output, suffix_lse, output_lse_cuda) + torch.cuda.synchronize() + + for _ in range(repeat_times): + start.record() + merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse, + suffix_output, suffix_lse, output_lse_cuda) + end.record() + torch.cuda.synchronize() + total_time_cuda_kernel += start.elapsed_time(end) + + avg_time_cuda_kernel = total_time_cuda_kernel / repeat_times + + # 3. Performance compare + performance_improved = avg_time_triton_kernel / avg_time_cuda_kernel + print(f" Torch time: {avg_time_torch_kernel:.6f}ms") + print(f"Triton time: {avg_time_triton_kernel:.6f}ms") + print(f" CUDA time: {avg_time_cuda_kernel:.6f}ms, " + f"Performance: {performance_improved:.5f}x") + print("-" * 100) + + # 4. Correctness compare + # Liger Kernel: Efficient Triton Kernels for LLM Training + # https://arxiv.org/pdf/2410.10989, 3.3 Correctness + # use rtol = 1e-2 for bfloat16. + rtol = 1e-2 if output_dtype == torch.bfloat16 else 1e-3 + + def diff(a: torch.Tensor, b: torch.Tensor): + max_diff = torch.max(torch.abs(a.float() - b.float())) + return max_diff + + # Use Triton output as reference because we want to replace + # the Triton kernel with custom CUDA kernel for merge attn + # states operation. 
+ output_ref = output_ref_triton + output_lse_ref = output_lse_ref_triton + torch.testing.assert_close(output_cuda.float(), + output_ref.float(), + atol=1e-3, + rtol=rtol) + print("Output all match, max abs diff:") + print(f"(Triton vs Torch) : {diff(output_torch, output_ref)}") + print(f" (CUDA vs Torch) : {diff(output_torch, output_cuda)}") + print(f" (CUDA vs Triton): {diff(output_ref, output_cuda)}") + print("-" * 100) + + torch.testing.assert_close(output_lse_cuda.float(), + output_lse_ref.float(), + atol=1e-3, + rtol=rtol) + print("Output LSE all match, max abs diff:") + print(f"(Triton vs Torch) : {diff(output_lse_torch, output_lse_ref)}") + print(f" (CUDA vs Torch) : {diff(output_lse_torch, output_lse_cuda)}") + print(f" (CUDA vs Triton): {diff(output_lse_ref, output_lse_cuda)}") + print("-" * 100) + + print("All output values test passed! All inf values " + "are correctly replaced with -inf.") + print("-" * 100) + + device = current_platform.get_device_name() + all_case_info.append( + (NUM_TOKENS, NUM_HEADS, HEAD_SIZE, output_dtype, device, + avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel, + performance_improved)) + if len(all_case_info) == (len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) * + len(NUM_QUERY_HEADS) * len(DTYPES)): + generate_markdown_table() diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py similarity index 100% rename from tests/kernels/test_mha_attn.py rename to tests/kernels/attention/test_mha_attn.py diff --git a/tests/kernels/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py similarity index 100% rename from tests/kernels/test_mla_decode_cpu.py rename to tests/kernels/attention/test_mla_decode_cpu.py diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py similarity index 99% rename from tests/kernels/test_prefix_prefill.py rename to tests/kernels/attention/test_prefix_prefill.py index 50eaa92f59b5c..9333777d38ea0 100644 
--- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -164,6 +164,7 @@ def test_contexted_kv_attention( block_table, b_start_loc, b_seq_len, + MAX_CTX_LEN, max_input_len, k_scale, v_scale, @@ -180,6 +181,7 @@ def test_contexted_kv_attention( block_table, b_start_loc, b_seq_len, + MAX_CTX_LEN, max_input_len, k_scale, v_scale, @@ -397,6 +399,7 @@ def test_contexted_kv_attention_alibi( block_table, b_start_loc, b_seq_len, + MAX_CTX_LEN, max_input_len, k_scale, v_scale, @@ -413,6 +416,7 @@ def test_contexted_kv_attention_alibi( block_table, b_start_loc, b_seq_len, + MAX_CTX_LEN, max_input_len, k_scale, v_scale, diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py new file mode 100644 index 0000000000000..4cf7bcb01d4d7 --- /dev/null +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch + +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms.rocm import RocmPlatform +from vllm.utils import STR_BACKEND_ENV_VAR + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. 
+ """ + _cached_get_attn_backend.cache_clear() + + +def test_selector(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") + + # Set the current platform to ROCm using monkeypatch + monkeypatch.setattr("vllm.attention.selector.current_platform", + RocmPlatform()) + + # Test standard ROCm attention + backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + assert (backend.get_name() == "ROCM_FLASH" + or backend.get_name() == "TRITON_ATTN_VLLM_V1") + + # MLA test for deepseek related + + # change the attention backend to triton MLA + m.setenv(STR_BACKEND_ENV_VAR, "TRITON_MLA") + backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, + False, True) + assert backend.get_name() == "TRITON_MLA" + + # If attention backend is None + # If use_mla is true + # The selected backend is triton MLA + m.setenv(STR_BACKEND_ENV_VAR, None) + backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, + False, True) + assert backend.get_name() == "TRITON_MLA" + + # change the attention backend to AITER MLA + m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA") + backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, + False, True) + assert backend.get_name() == "ROCM_AITER_MLA" + + # If attention backend is None + # If use_mla is true + # If VLLM_ROCM_USE_AITER is enabled + # The selected backend is ROCM_AITER_MLA + m.setenv(STR_BACKEND_ENV_VAR, None) + m.setenv("VLLM_ROCM_USE_AITER", "1") + backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, + False, True) + assert backend.get_name() == "ROCM_AITER_MLA" diff --git a/tests/kernels/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py similarity index 100% rename from tests/kernels/test_triton_decode_attention.py rename to tests/kernels/attention/test_triton_decode_attention.py diff --git a/tests/kernels/test_activation.py b/tests/kernels/core/test_activation.py similarity index 97% 
rename from tests/kernels/test_activation.py rename to tests/kernels/core/test_activation.py index cf0f21ce06514..79f838a954e70 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -5,6 +5,7 @@ import random import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, MulAndSilu, @@ -12,8 +13,6 @@ from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, SiluAndMul) from vllm.platforms import current_platform -from .allclose_default import get_default_atol, get_default_rtol - DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 13824] # Arbitrary values for testing diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py similarity index 100% rename from tests/kernels/test_fused_quant_layernorm.py rename to tests/kernels/core/test_fused_quant_layernorm.py diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/core/test_layernorm.py similarity index 100% rename from tests/kernels/test_layernorm.py rename to tests/kernels/core/test_layernorm.py diff --git a/tests/kernels/core/test_opcheck.py b/tests/kernels/core/test_opcheck.py new file mode 100644 index 0000000000000..c9a9679c5d80f --- /dev/null +++ b/tests/kernels/core/test_opcheck.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Tests for miscellaneous utilities +""" + +import torch + +from tests.kernels.utils import opcheck + + +def test_convert_fp8_opcheck(): + data = torch.randn((256, 256), dtype=torch.float32, device="cuda") + result = torch.empty_like(data, dtype=torch.float8_e4m3fn) + opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) + + +# TODO: Add this back, currently fails with +# csrc/cuda_utils_kernels.cu:15 'invalid 
argument' +# @pytest.mark.skipif(not current_platform.is_cuda(), +# reason="Only supported for CUDA") +# def test_cuda_utils_opcheck(): +# opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) +# opcheck( +# torch.ops._C_cuda_utils. +# get_max_shared_memory_per_block_device_attribute, (0, )) diff --git a/tests/kernels/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py similarity index 100% rename from tests/kernels/test_permute_cols.py rename to tests/kernels/core/test_permute_cols.py diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py similarity index 99% rename from tests/kernels/test_pos_encoding.py rename to tests/kernels/core/test_pos_encoding.py index eb83b4d612c22..2b7bf755ec22d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -6,11 +6,10 @@ from typing import Callable, Optional import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform -from .allclose_default import get_default_atol, get_default_rtol - IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] HEAD_SIZES = [64, 80, 112, 120, 256] diff --git a/tests/kernels/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py similarity index 100% rename from tests/kernels/test_rotary_embedding.py rename to tests/kernels/core/test_rotary_embedding.py diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py new file mode 100644 index 0000000000000..f641ae7b67c2d --- /dev/null +++ b/tests/kernels/core/test_uva.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available + +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + + +@pytest.mark.skipif(not 
is_uva_available(), reason="UVA is not available.") +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_cpu_write(device): + torch.set_default_device(device) + cpu_tensor = torch.zeros(10, + 10, + device="cpu", + pin_memory=True, + dtype=torch.int32) + cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor) + assert cuda_view.device.type == "cuda" + + assert cuda_view[0, 0] == 0 + assert cuda_view[2, 3] == 0 + assert cuda_view[4, 5] == 0 + + cpu_tensor[0, 0] = 1 + cpu_tensor[2, 3] = 2 + cpu_tensor[4, 5] = -1 + + cuda_view.mul_(2) + assert cuda_view[0, 0] == 2 + assert cuda_view[2, 3] == 4 + assert cuda_view[4, 5] == -2 + + +@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.") +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_gpu_write(device): + torch.set_default_device(device) + cpu_tensor = torch.zeros(10, + 10, + device="cpu", + pin_memory=True, + dtype=torch.int32) + cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor) + assert cuda_view.device.type == "cuda" + + assert cuda_view[0, 0] == 0 + assert cuda_view[2, 3] == 0 + assert cuda_view[4, 5] == 0 + + cuda_view[0, 0] = 1 + cuda_view[2, 3] = 2 + cuda_view[4, 5] = -1 + cuda_view.mul_(2) + + assert cpu_tensor[0, 0] == 2 + assert cpu_tensor[2, 3] == 4 + assert cpu_tensor[4, 5] == -2 \ No newline at end of file diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py similarity index 100% rename from tests/kernels/test_causal_conv1d.py rename to tests/kernels/mamba/test_causal_conv1d.py diff --git a/tests/kernels/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py similarity index 100% rename from tests/kernels/test_mamba_mixer2.py rename to tests/kernels/mamba/test_mamba_mixer2.py diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py similarity index 100% rename from tests/kernels/test_mamba_ssm.py rename to tests/kernels/mamba/test_mamba_ssm.py diff --git a/tests/kernels/test_mamba_ssm_ssd.py 
b/tests/kernels/mamba/test_mamba_ssm_ssd.py similarity index 95% rename from tests/kernels/test_mamba_ssm_ssd.py rename to tests/kernels/mamba/test_mamba_ssm_ssd.py index 8f23a9b216e98..ee908105f557f 100644 --- a/tests/kernels/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -5,6 +5,8 @@ import torch import torch.nn.functional as F from einops import rearrange, repeat +from vllm.model_executor.layers.mamba.mamba2_metadata import ( + _seq_idx_to_chunk_indices_offsets) from vllm.model_executor.layers.mamba.ops.ssd_combined import ( mamba_chunk_scan_combined) from vllm.platforms import current_platform @@ -160,14 +162,14 @@ def generate_continous_batched_examples(example_lens_by_batch, # get the metadata cu_seqlens = torch.tensor((0, ) + spec, device=device).cumsum(dim=0) - sed_idx = torch.zeros(cu_seqlens[-1], + seq_idx = torch.zeros(cu_seqlens[-1], dtype=torch.int32, device=cu_seqlens.device) for i, (srt, end) in enumerate(zip( cu_seqlens, cu_seqlens[1:], )): - sed_idx[srt:end] = i + seq_idx[srt:end] = i # for cont batch if IND_E is None: @@ -177,7 +179,7 @@ def generate_continous_batched_examples(example_lens_by_batch, IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)] yield ([Y_min[s, IND_S[s]:IND_E[s]] for s in range(num_examples)], - cu_seqlens, sed_idx.unsqueeze(0), (A, dt2, X2, B2, C2)) + cu_seqlens, seq_idx.unsqueeze(0), (A, dt2, X2, B2, C2)) @pytest.mark.parametrize("itype", @@ -266,12 +268,15 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, exhausted: dict = {} # map: eg -> boolean indicating example is exhausted states = None - for Y_min, cu_seqlens, sed_idx, (A, dt, X, B, + for Y_min, cu_seqlens, seq_idx, (A, dt, X, B, C) in generate_continous_batched_examples( cases, num_examples, seqlen, last_taken, exhausted, n_heads, d_head, itype): + chunk_indices, chunk_offsets = _seq_idx_to_chunk_indices_offsets( + seq_idx, chunk_size) + Y, new_states = mamba_chunk_scan_combined( X, dt, @@ -281,7 
+286,9 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, chunk_size, D=None, cu_seqlens=cu_seqlens, - seq_idx=sed_idx, + seq_idx=seq_idx, + chunk_indices=chunk_indices, + chunk_offsets=chunk_offsets, return_varlen_states=True, initial_states=states, ) diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py new file mode 100644 index 0000000000000..975cd418a171f --- /dev/null +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -0,0 +1,364 @@ +# SPDX-License-Identifier: Apache-2.0 +import dataclasses +from typing import Optional + +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 +from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts, + fused_topk) +from vllm.platforms import current_platform + +NUM_EXPERTS = [40, 64] +TOP_KS = [6, 8] + +MNK_FACTORS = [ + (2, 1024, 1024), + (2, 1024, 1536), + (2, 3072, 1024), + (2, 3072, 1536), + (64, 1024, 1024), + (64, 1024, 1536), + (64, 3072, 1024), + (64, 3072, 1536), + (224, 1024, 1024), + (224, 1024, 1536), + (224, 3072, 1024), + (224, 3072, 1536), +] + + +@dataclasses.dataclass +class MOETensors: + a: torch.Tensor + w1: torch.Tensor + w2: torch.Tensor + ab_strides1: torch.Tensor + c_strides1: torch.Tensor + ab_strides2: torch.Tensor + c_strides2: torch.Tensor + + @staticmethod + def make_moe_tensors(m: int, k: int, n: int, e: int, + dtype: torch.dtype) -> "MOETensors": + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + 
c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + return MOETensors(a=a, + w1=w1, + w2=w2, + ab_strides1=ab_strides1, + c_strides1=c_strides1, + ab_strides2=ab_strides2, + c_strides2=c_strides2) + + +@dataclasses.dataclass +class MOETensors8Bit(MOETensors): + # quantized + a_q: Optional[torch.Tensor] = None # a -> a_q + w1_q: Optional[torch.Tensor] = None # w1 -> w1_q + w2_q: Optional[torch.Tensor] = None # w2 -> w2_q + a_scale: Optional[torch.Tensor] = None + w1_scale: Optional[torch.Tensor] = None + w2_scale: Optional[torch.Tensor] = None + # dequantized + a_d: Optional[torch.Tensor] = None # a -> a_q -> a_d + w1_d: Optional[torch.Tensor] = None # w1 -> w1_q -> w1_d + w2_d: Optional[torch.Tensor] = None # w2 -> w2_q -> w2_d + + @staticmethod + def make_moe_tensors_8bit(m: int, k: int, n: int, e: int, + per_act_token: bool, + per_out_channel: bool) -> "MOETensors8Bit": + dtype = torch.half + q_dtype = torch.float8_e4m3fn + + moe_tensors_fp16 = MOETensors.make_moe_tensors(m, k, n, e, dtype) + + # a -> a_q, w1 -> w1_q, w2 -> w2_q + n_b_scales = 2 * n if per_out_channel else 1 + k_b_scales = k if per_out_channel else 1 + # Get the right scale for tests. 
+ _, a_scale = ops.scaled_fp8_quant( + moe_tensors_fp16.a, use_per_token_if_dynamic=per_act_token) + a_q, _ = ops.scaled_fp8_quant(moe_tensors_fp16.a, + a_scale, + use_per_token_if_dynamic=per_act_token) + w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype) + w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype) + + w1_scale = torch.empty((e, n_b_scales, 1), + device="cuda", + dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), + device="cuda", + dtype=torch.float32) + for expert in range(e): + w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( + moe_tensors_fp16.w1[expert], + use_per_token_if_dynamic=per_out_channel) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( + moe_tensors_fp16.w2[expert], + use_per_token_if_dynamic=per_out_channel) + + # a_q -> a_d, w1_q -> w1_d, w2_q -> w2_d + a_d = a_q.float().mul(a_scale).to(dtype) + w1_d = torch.empty_like(moe_tensors_fp16.w1) + w2_d = torch.empty_like(moe_tensors_fp16.w2) + for expert in range(e): + w1_d[expert] = (w1_q[expert].float() * w1_scale[expert]).half() + w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half() + + return MOETensors8Bit(a=moe_tensors_fp16.a, + w1=moe_tensors_fp16.w1, + w2=moe_tensors_fp16.w2, + ab_strides1=moe_tensors_fp16.ab_strides1, + c_strides1=moe_tensors_fp16.c_strides1, + ab_strides2=moe_tensors_fp16.ab_strides2, + c_strides2=moe_tensors_fp16.c_strides2, + a_q=a_q, + w1_q=w1_q, + w2_q=w2_q, + a_scale=a_scale, + w1_scale=w1_scale, + w2_scale=w2_scale, + a_d=a_d, + w1_d=w1_d, + w2_d=w2_d) + + +def run_with_expert_maps(num_experts: int, num_local_experts: int, + **cutlass_moe_kwargs): + + def slice_experts(): + slice_params = [ + "w1_q", "w2_q", "ab_strides1", "ab_strides2", "c_strides1", + "c_strides2", "w1_scale", "w2_scale" + ] + full_tensors = { + k: v + for k, v in cutlass_moe_kwargs.items() + if k in slice_params and k in cutlass_moe_kwargs + } + + for i in range(0, num_experts, num_local_experts): + s, e = i, i + num_local_experts + 
+ # make expert map + expert_map = [-1] * num_experts + expert_map[s:e] = list(range(num_local_experts)) + expert_map = torch.tensor(expert_map, + dtype=torch.int32, + device="cuda") + + # update cutlass moe arg with expert_map + cutlass_moe_kwargs["expert_map"] = expert_map + # update cutlass moe arg tensors + for k, t in full_tensors.items(): + cutlass_moe_kwargs[k] = t[s:e] + + yield cutlass_moe_kwargs + + out_tensor = torch.zeros_like(cutlass_moe_kwargs["a"]) + for kwargs in slice_experts(): + out_tensor = out_tensor + cutlass_moe_fp8(**kwargs) + + return out_tensor + + +def run_8_bit(moe_tensors: MOETensors8Bit, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_local_experts: Optional[int] = None) -> torch.Tensor: + assert not any([ + t is None for t in [ + moe_tensors.w1_q, moe_tensors.w2_q, moe_tensors.w1_scale, + moe_tensors.w2_scale, moe_tensors.a_scale + ] + ]) + + kwargs = { + 'a': moe_tensors.a, + 'w1_q': moe_tensors.w1_q.transpose(1, 2), # type: ignore[union-attr] + 'w2_q': moe_tensors.w2_q.transpose(1, 2), # type: ignore[union-attr] + 'topk_weights': topk_weights, + 'topk_ids_': topk_ids, + 'ab_strides1': moe_tensors.ab_strides1, + 'c_strides1': moe_tensors.c_strides1, + 'ab_strides2': moe_tensors.ab_strides2, + 'c_strides2': moe_tensors.c_strides2, + 'w1_scale': moe_tensors.w1_scale, + 'w2_scale': moe_tensors.w2_scale, + 'a1_scale': moe_tensors.a_scale + } + + num_experts = moe_tensors.w1.size(0) + with_ep = num_local_experts is not None or num_local_experts == num_experts + if not with_ep: + return cutlass_moe_fp8(**kwargs) + + assert num_local_experts is not None + return run_with_expert_maps( + num_experts, + num_local_experts, # type: ignore[arg-type] + **kwargs) + + +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.skipif( + 
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_moe_8_bit_no_graph( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, + per_out_ch) + + score = torch.randn((m, e), device="cuda", dtype=torch.half) + topk_weights, topk_ids = fused_topk(mt.a, + score, + topk, + renormalize=False) + + # Note that we are using the dequantized versions of the tensors. + # Using a, w1 and w2 directly results in minor output differences. + triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, + topk_ids) + + cutlass_output = run_8_bit(mt, topk_weights, topk_ids) + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=5e-2, + rtol=1e-2) + + +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_moe_8_bit_cuda_graph( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + dtype = torch.half + + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, + per_out_ch) + + score = torch.randn((m, e), device="cuda", dtype=dtype) + topk_weights, 
topk_ids = fused_topk(mt.a, + score, + topk, + renormalize=False) + + # Note that we are using the dequantized versions of the tensors. + # Using a, w1 and w2 directly results in minor output differences. + triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, + topk_ids) + + stream = torch.cuda.Stream() + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, stream=stream): + cutlass_output = run_8_bit(mt, topk_weights, topk_ids) + + torch.cuda.synchronize() + graph.replay() + torch.cuda.synchronize() + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=9e-2, + rtol=1e-2) + + +@pytest.mark.parametrize("m", [64]) +@pytest.mark.parametrize("n", [1024]) +@pytest.mark.parametrize("k", [4096]) +@pytest.mark.parametrize("e", [16]) +@pytest.mark.parametrize("topk", [1, 8]) +@pytest.mark.parametrize("per_act_token", [True]) +@pytest.mark.parametrize("per_out_channel", [True]) +@pytest.mark.parametrize("ep_size", [1, 2, 4, 8, 16]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_moe_8_bit_EP( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_channel: bool, + ep_size: int, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, + per_out_channel) + + score = torch.randn((m, e), device="cuda", dtype=torch.half) + topk_weights, topk_ids = fused_topk(mt.a, + score, + topk, + renormalize=False) + + # Note that we are using the dequantized versions of the tensors. + # Using a, w1 and w2 directly results in minor output differences. 
+ triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, + topk_ids) + + assert e % ep_size == 0, "Cannot distribute experts evenly" + cutlass_output = run_8_bit(mt, + topk_weights, + topk_ids, + num_local_experts=e // ep_size) + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=5e-2, + rtol=1e-2) diff --git a/tests/kernels/test_moe.py b/tests/kernels/moe/test_moe.py similarity index 68% rename from tests/kernels/test_moe.py rename to tests/kernels/moe/test_moe.py index 653d2734afe89..425f36984a33b 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -3,7 +3,6 @@ Run `pytest tests/kernels/test_moe.py`. """ - import pytest import torch from torch.nn import Parameter @@ -12,16 +11,14 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock import vllm.model_executor.layers.fused_moe # noqa -from tests.kernels.utils import (compute_max_diff, opcheck, stack_and_dev, - torch_moe, torch_moe_single) -from vllm import _custom_ops as ops +from tests.kernels.utils import (opcheck, stack_and_dev, torch_moe, + torch_moe_single) from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_topk, moe_align_block_size) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( fused_moe as iterative_moe) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - marlin_quantize) + awq_marlin_quantize, marlin_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( quantize_weights) from vllm.model_executor.models.mixtral import MixtralMoE @@ -216,11 +213,17 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("padding", [True, False]) 
+@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) @torch.inference_mode() -def test_mixtral_moe(dtype: torch.dtype, padding: bool): +def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, + monkeypatch): """Make sure our Mixtral MoE implementation agrees with the one from huggingface.""" + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + # Instantiate our and huggingface's MoE blocks config = MixtralConfig() hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda") @@ -268,20 +271,31 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool): torch.bfloat16: 1e-2, } - torch.testing.assert_close(hf_states.flatten(0, 1), - vllm_states, - rtol=mixtral_moe_tol[dtype], - atol=mixtral_moe_tol[dtype]) + if use_rocm_aiter: + # The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501 + # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501 + torch.testing.assert_close(hf_states.flatten(0, 1), + vllm_states, + rtol=0.01, + atol=100) + else: + torch.testing.assert_close(hf_states.flatten(0, 1), + vllm_states, + rtol=mixtral_moe_tol[dtype], + atol=mixtral_moe_tol[dtype]) -@pytest.mark.parametrize("m", [1, 33, 64, 222]) -@pytest.mark.parametrize("n", [128, 2048]) -@pytest.mark.parametrize("k", [128, 1024]) -@pytest.mark.parametrize("e", NUM_EXPERTS) -@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("m", [1, 33, 123]) +@pytest.mark.parametrize("n", [128, 1024]) +@pytest.mark.parametrize("k", [256, 2048]) +@pytest.mark.parametrize("e", [4, 12]) +@pytest.mark.parametrize("topk", [2, 3]) +@pytest.mark.parametrize("ep_size", [1, 4]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("group_size", [-1, 32, 128]) @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) 
+@pytest.mark.parametrize("has_zp", [True, False]) @pytest.mark.parametrize("is_k_full", [True, False]) @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_fused_marlin_moe( @@ -290,9 +304,12 @@ def test_fused_marlin_moe( k: int, e: int, topk: int, + ep_size: int, + dtype: torch.dtype, group_size: int, act_order: bool, num_bits: int, + has_zp: bool, is_k_full: bool, ): current_platform.seed_everything(7) @@ -303,75 +320,110 @@ def test_fused_marlin_moe( return if group_size in (k, n): return + if has_zp: + return else: if not is_k_full: return - quant_type = (scalar_types.uint4b8 - if num_bits == 4 else scalar_types.uint8b128) - dtype = torch.float16 + if has_zp: + # we don't build kernel for int8 with zero + if num_bits == 8: + return + quant_type = scalar_types.uint4 if num_bits == 4 else scalar_types.uint8 + else: + quant_type = scalar_types.uint4b8 \ + if num_bits == 4 else scalar_types.uint8b128 a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + if ep_size > 1: + local_e = e // ep_size + e_ids = torch.randperm(e, device="cuda", dtype=torch.int32)[:local_e] + e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32) + e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32) + w1 = w1[e_ids] + w2 = w2[e_ids] + else: + e_map = None + w_ref1_l = [] qweight1_l = [] scales1_l = [] + zeros1_l = [] g_idx1_l = [] sort_indices1_l = [] for i in range(w1.shape[0]): - test_perm = torch.randperm(k) - w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize( - w1[i].transpose(1, 0), quant_type, group_size, act_order, - test_perm) - w_ref1_l.append(w_ref1) - qweight1_l.append(qweight1) - scales1_l.append(scales1) - g_idx1_l.append(g_idx1) - sort_indices1_l.append(sort_indices1) + if has_zp: + w_ref1, qweight1, scales1, zeros1 = awq_marlin_quantize( + w1[i].transpose(1, 0), 
quant_type, group_size) + + w_ref1_l.append(w_ref1.T) + qweight1_l.append(qweight1) + scales1_l.append(scales1) + zeros1_l.append(zeros1) + else: + test_perm = torch.randperm(k) + quant_res = marlin_quantize(w1[i].transpose(1, 0), quant_type, + group_size, act_order, test_perm) + w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = quant_res + + w_ref1_l.append(w_ref1.T) + qweight1_l.append(qweight1) + scales1_l.append(scales1) + g_idx1_l.append(g_idx1) + sort_indices1_l.append(sort_indices1) w_ref1 = stack_and_dev(w_ref1_l) qweight1 = stack_and_dev(qweight1_l).contiguous() scales1 = stack_and_dev(scales1_l) - g_idx1 = stack_and_dev(g_idx1_l) - sort_indices1 = stack_and_dev(sort_indices1_l) + g_idx1 = stack_and_dev(g_idx1_l) if g_idx1_l else None + zeros1 = stack_and_dev(zeros1_l) if zeros1_l else None + sort_indices1 = stack_and_dev(sort_indices1_l) if sort_indices1_l else None w_ref2_l = [] qweight2_l = [] scales2_l = [] + zeros2_l = [] g_idx2_l = [] sort_indices2_l = [] for i in range(w2.shape[0]): - test_perm = torch.randperm(n) - w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize( - w2[i].transpose(1, 0), quant_type, group_size, act_order, - test_perm) - w_ref2_l.append(w_ref2) - qweight2_l.append(qweight2) - scales2_l.append(scales2) - g_idx2_l.append(g_idx2) - sort_indices2_l.append(sort_indices2) + if has_zp: + w_ref2, qweight2, scales2, zeros2 = awq_marlin_quantize( + w2[i].transpose(1, 0), quant_type, group_size) + + w_ref2_l.append(w_ref2.T) + qweight2_l.append(qweight2) + scales2_l.append(scales2) + zeros2_l.append(zeros2) + else: + test_perm = torch.randperm(n) + quant_res = marlin_quantize(w2[i].transpose(1, 0), quant_type, + group_size, act_order, test_perm) + w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = quant_res + + w_ref2_l.append(w_ref2.T) + qweight2_l.append(qweight2) + scales2_l.append(scales2) + g_idx2_l.append(g_idx2) + sort_indices2_l.append(sort_indices2) w_ref2 = stack_and_dev(w_ref2_l) qweight2 = 
stack_and_dev(qweight2_l).contiguous() scales2 = stack_and_dev(scales2_l) - g_idx2 = stack_and_dev(g_idx2_l) - sort_indices2 = stack_and_dev(sort_indices2_l) + g_idx2 = stack_and_dev(g_idx2_l) if g_idx2_l else None + zeros2 = stack_and_dev(zeros2_l) if zeros2_l else None + sort_indices2 = stack_and_dev(sort_indices2_l) if sort_indices2_l else None score = torch.randn((m, e), device="cuda", dtype=dtype) topk_weights, topk_ids = fused_topk(a, score, topk, False) - triton_output = fused_moe( - a, - w_ref1.transpose(1, 2).contiguous(), - w_ref2.transpose(1, 2).contiguous(), - score, - topk, - renormalize=False, - ) + torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map) + marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, @@ -381,111 +433,91 @@ def test_fused_marlin_moe( score, topk_weights, topk_ids, + global_num_experts=e, + expert_map=e_map, g_idx1=g_idx1, g_idx2=g_idx2, sort_indices1=sort_indices1, sort_indices2=sort_indices2, + w1_zeros=zeros1, + w2_zeros=zeros2, num_bits=num_bits, - is_k_full=is_k_full, - ) + is_k_full=is_k_full) - assert compute_max_diff(marlin_output, triton_output) < 4e-2 - - if ops.supports_moe_ops: - token_expert_indicies = torch.empty(m, - topk, - dtype=torch.int32, - device=a.device) - - opcheck(torch.ops._moe_C.topk_softmax, ( - topk_weights, - topk_ids, - token_expert_indicies, - score.float(), - )) - - block_size_m = 4 - - sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, - e) - - max_workspace_size = ((m + 255) // 256) * (max(2 * n, k) // 64) * 16 - workspace = torch.zeros(max_workspace_size, - dtype=torch.int, - device="cuda", - requires_grad=False) - - zp = torch.empty((0, 0), - dtype=dtype, - device="cuda", - requires_grad=False) - opcheck(torch.ops._moe_C.marlin_gemm_moe, - (a, qweight1, sorted_token_ids, topk_weights, topk_ids, - scales1, zp, g_idx1, sort_indices1, workspace, quant_type.id, - m, 2 * n, k, True, e, topk, block_size_m, True, False)) + torch.testing.assert_close(marlin_output, 
torch_output, atol=2e-2, rtol=0) @pytest.mark.skip("This test is here for the sake of debugging, " "don't run it in automated tests.") -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("m", [1, 33, 123]) +@pytest.mark.parametrize("n", [128, 1024]) +@pytest.mark.parametrize("k", [256, 2048]) +@pytest.mark.parametrize("e", [4, 12]) +@pytest.mark.parametrize("topk", [2, 3]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("group_size", [-1, 32, 128]) @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) +@pytest.mark.parametrize("has_zp", [True, False]) @pytest.mark.parametrize("is_k_full", [True, False]) -@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") -def test_single_marlin_moe_multiply( - m: int, - n: int, - k: int, - e: int, - topk: int, - group_size: int, - act_order: bool, - num_bits: int, - is_k_full: bool, -): - +def test_single_marlin_moe_multiply(m: int, n: int, k: int, e: int, topk: int, + dtype: torch.dtype, group_size: int, + act_order: bool, num_bits: int, + has_zp: bool, is_k_full: bool): # Filter act_order if act_order: if group_size == -1: return - if group_size == k: + if group_size in (k, n): + return + if has_zp: return else: if not is_k_full: return - quant_type = (scalar_types.uint4b8 - if num_bits == 4 else scalar_types.uint8b128) - dtype = torch.float16 + if has_zp: + quant_type = scalar_types.uint4 if num_bits == 4 else scalar_types.uint8 + else: + quant_type = scalar_types.uint4b8 \ + if num_bits == 4 else scalar_types.uint8b128 a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10 w_ref_l = [] - 
qweights_l = [] + qweight_l = [] scales_l = [] + zeros_l = [] g_idx_l = [] sort_indices_l = [] for i in range(w.shape[0]): - test_perm = torch.randperm(k) - w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize( - w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm) - w_ref_l.append(w_ref) - qweights_l.append(qweight) - scales_l.append(scales) - g_idx_l.append(g_idx) - sort_indices_l.append(sort_indices) + if has_zp: + w_ref, qweight, scales, zeros = awq_marlin_quantize( + w[i].transpose(1, 0), quant_type, group_size) + + w_ref_l.append(w_ref.T) + qweight_l.append(qweight) + scales_l.append(scales) + zeros_l.append(zeros) + else: + test_perm = torch.randperm(k) + w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize( + w[i].transpose(1, 0), quant_type, group_size, act_order, + test_perm) + + w_ref_l.append(w_ref.T) + qweight_l.append(qweight) + scales_l.append(scales) + g_idx_l.append(g_idx) + sort_indices_l.append(sort_indices) w_ref = stack_and_dev(w_ref_l) - qweight = stack_and_dev(qweights_l).contiguous() + qweight = stack_and_dev(qweight_l).contiguous() scales = stack_and_dev(scales_l) - g_idx = stack_and_dev(g_idx_l) - sort_indices = stack_and_dev(sort_indices_l) + g_idx = stack_and_dev(g_idx_l) if g_idx_l else None + zeros = stack_and_dev(zeros_l) if zeros_l else None + sort_indices = stack_and_dev(sort_indices_l) if sort_indices_l else None score = torch.randn((m, e), device="cuda", dtype=dtype) marlin_output = torch.ops.vllm.single_marlin_moe( @@ -497,13 +529,14 @@ def test_single_marlin_moe_multiply( renormalize=False, g_idx=g_idx, sort_indices=sort_indices, + w_zeros=zeros, num_bits=num_bits, is_k_full=is_k_full, ) - torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) + torch_output = torch_moe_single(a, w_ref, score, topk) - assert compute_max_diff(marlin_output, torch_output) < 1e-2 + torch.testing.assert_close(marlin_output, torch_output, atol=2e-2, rtol=0) def 
test_moe_align_block_size_opcheck(): diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py new file mode 100644 index 0000000000000..44734e9340aa1 --- /dev/null +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_triton_moe_channel_fp8_kernel.py +import itertools + +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.platforms import current_platform + +if current_platform.get_device_capability() < (9, 0): + pytest.skip("FP8 Triton requires CUDA 9.0 or higher", + allow_module_level=True) + + +def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): + """Matrix multiplication function that supports per-token input + quantization and per-column weight quantization""" + A = A.to(torch.float32) + B = B.to(torch.float32) + + assert A.shape[-1] == B.shape[-1], "Dimension mismatch" + assert B.ndim == 2 and B.is_contiguous( + ), "B must be a 2D contiguous tensor" + + # Reshape input + M = A.numel() // A.shape[-1] + B = B.t() # Transpose weight matrix + N, K = B.shape + origin_C_shape = A.shape[:-1] + (K, ) + A = A.reshape(M, N) + + # As is per-token [M, 1], Bs is per-column [1, K] + C = torch.matmul(A, B) # [M, K] + C = As * C * Bs.view(1, -1) # Broadcast per-column scale + + return C.reshape(origin_C_shape).to(output_dtype) + + +def fp8_mask(a, mask): + dtype = a.dtype + return a.view(torch.int8)[mask].view(dtype) + + +def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk): + """This function performs fused moe with per-column int8 + quantization using native torch.""" + + B, D = a.shape + # Perform per-token quantization + a_q, a_s = ops.scaled_fp8_quant(a, use_per_token_if_dynamic=True) + # Repeat tokens to 
match topk + a_q = a_q.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + # Also repeat the scale + a_s = a_s.view(B, -1, 1).repeat(1, topk, 1).reshape(-1, 1) # [B*topk, 1] + + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + + # Calculate routing + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + # Process each expert + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + # First MLP layer: note that a_s is now per-token + inter_out = native_w8a8_per_token_matmul( + fp8_mask(a_q, mask), + w1[i], + fp8_mask(a_s, mask), + w1_s[i], + output_dtype=a.dtype, + ) + # Activation function + act_out = SiluAndMul().forward_native(inter_out) + # Quantize activation output with per-token + act_out_q, act_out_s = ops.scaled_fp8_quant( + act_out, use_per_token_if_dynamic=True) + + # Second MLP layer + out[mask] = native_w8a8_per_token_matmul(act_out_q, + w2[i], + act_out_s, + w2_s[i], + output_dtype=a.dtype) + # Apply routing weights and sum + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + + +@pytest.fixture(autouse=True, scope="module") +def setup_cuda(): + """Sets the default CUDA device for all tests in this module.""" + torch.set_default_device("cuda") + + +DTYPES = [torch.half, torch.bfloat16] +M = [1, 33] +N = [128, 1024] +K = [256, 4096] +E = [8] +TOP_KS = [2, 6] +SEEDS = [0] + + +@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed", + itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): + torch.manual_seed(seed) + # Initialize int8 quantization parameters + factor_for_scale = 1e-2 + finfo = torch.finfo(torch.float8_e4m3fn) + fp8_max = finfo.max + fp8_min = finfo.min + + # Input tensor + # M * K + a = torch.randn((M, K), dtype=dtype) / 10 + + # Generate int8 
weights + w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 + w1 = (w1_fp32 * fp8_max).clamp(min=fp8_min, + max=fp8_max).to(torch.float8_e4m3fn) + + w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 + w2 = (w2_fp32 * fp8_max).clamp(min=fp8_min, + max=fp8_max).to(torch.float8_e4m3fn) + + # Generate scale for each column (per-column quantization) + w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * factor_for_scale + w2_s = torch.rand(E, K, device=w2_fp32.device) * factor_for_scale + score = torch.randn((M, E), dtype=dtype) + + ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk) + out = fused_moe( + a, + w1, + w2, + score, + topk, + renormalize=False, + use_fp8_w8a8=True, # using fp8 + per_channel_quant=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=None, # Not using block quantization + ) + + # Check results + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.05 diff --git a/tests/kernels/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py similarity index 100% rename from tests/kernels/test_allspark_gemm.py rename to tests/kernels/quantization/test_allspark_gemm.py diff --git a/tests/kernels/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py similarity index 100% rename from tests/kernels/test_aqlm.py rename to tests/kernels/quantization/test_aqlm.py diff --git a/tests/kernels/test_awq.py b/tests/kernels/quantization/test_awq.py similarity index 100% rename from tests/kernels/test_awq.py rename to tests/kernels/quantization/test_awq.py diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/quantization/test_awq_marlin.py similarity index 100% rename from tests/kernels/test_awq_marlin.py rename to tests/kernels/quantization/test_awq_marlin.py diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py similarity index 100% 
rename from tests/kernels/test_awq_triton.py rename to tests/kernels/quantization/test_awq_triton.py diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py new file mode 100644 index 0000000000000..da594675e924a --- /dev/null +++ b/tests/kernels/quantization/test_block_fp8.py @@ -0,0 +1,449 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from https://github.com/sgl-project/sglang/pull/2575 +import itertools + +import pytest +import torch + +from tests.kernels.utils_block import native_w8a8_block_matmul +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + deep_gemm_moe_fp8) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( + moe_align_block_size) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, w8a8_block_fp8_matmul) +from vllm.platforms import current_platform + +dg_available = False +try: + import deep_gemm + dg_available = True +except ImportError: + pass + +if current_platform.get_device_capability() < (9, 0): + pytest.skip("FP8 Triton requires CUDA 9.0 or higher", + allow_module_level=True) + +# Test configurations +DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] +NUM_TOKENS = [7, 83, 2048] +D = [512, 4096, 5120, 13824] +GROUP_SIZE = [64, 128, 256, 512] +M = [1, 7, 8, 83, 84, 512, 2048, 4096] +N = [128, 512, 1024, 4096, 7168, 7748, 13824] +K = [256, 4096, 5120, 3884, 13824, 16384] +# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8 +# and its hidden size is 7168. 
+M_moe = [1, 2, 7, 83, 128, 512, 2048] +M_moe_dg = [128, 192, 512, 1335, 2048] +N_moe = [128, 256, 1024, 4608] # [13824] +K_moe = [256, 512, 7168] # [13824] +BLOCK_SIZE = [[128, 128]] +E = [2, 8, 16, 24] # [128, 256] +TOP_KS = [1, 2, 6] +OUT_DTYPES = [torch.bfloat16] # [torch.float32, torch.half, torch.bfloat16] +SEEDS = [0] + + +def native_per_token_group_quant_fp8(x, + group_size, + eps=1e-10, + dtype=torch.float8_e4m3fn): + """Function to perform per-token-group quantization on an input tensor + `x` using native torch.""" + assert x.shape[-1] % group_size == 0, ("the last dimension of `x` cannot " + "be divisible by `group_size`") + assert x.is_contiguous(), "`x` is not contiguous" + + finfo = torch.finfo(dtype) + fp8_min = finfo.min + fp8_max = finfo.max + + x_ = x.reshape(x.numel() // group_size, group_size) + amax = x_.abs().max(dim=-1, + keepdim=True)[0].clamp(min=eps).to(torch.float32) + x_s = amax / fp8_max + x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype) + x_q = x_q.reshape(x.shape) + x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, )) + + return x_q, x_s + + +def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape): + """Fused moe with block-wise quantization using native torch.""" + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + + _, block_k = block_shape[0], block_shape[1] + a_q, a_s = native_per_token_group_quant_fp8(a, block_k) + a_q = a_q.to(torch.float32) + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + inter_out = native_w8a8_block_matmul(a_q[mask], + w1[i], + a_s[mask], + w1_s[i], + block_shape, + output_dtype=a.dtype) + act_out = SiluAndMul().forward_native(inter_out) + act_out_q, act_out_s = 
native_per_token_group_quant_fp8( + act_out, block_k) + act_out = act_out.to(torch.float32) + out[mask] = native_w8a8_block_matmul(act_out_q, + w2[i], + act_out_s, + w2_s[i], + block_shape, + output_dtype=a.dtype) + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + + +# Skip all tests if CUDA is not available +pytest.importorskip("torch.cuda") + + +@pytest.fixture(autouse=True) +def setup_cuda(): + torch.set_default_device("cuda") + + +@pytest.mark.parametrize( + "num_tokens,d,dtype,group_size,seed", + itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) +@torch.inference_mode() +def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): + torch.manual_seed(seed) + x = torch.rand(num_tokens, d, dtype=dtype) + + ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size) + out, scale = per_token_group_quant_fp8(x, group_size) + + assert torch.allclose(out.to(torch.float32), + ref_out.to(torch.float32), + rtol=0.15) + assert torch.allclose(scale, ref_scale) + + +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): + torch.manual_seed(seed) + factor_for_scale = 1e-2 + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale + Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale + + 
ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, + out_dtype) + out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.001 + + +@pytest.mark.parametrize( + "M,N,K,E,topk,block_size,dtype,seed", + itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES, + SEEDS)) +@torch.inference_mode() +def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): + if topk > E: + pytest.skip(f"Skipping test; topk={topk} > E={E}") + + torch.manual_seed(seed) + factor_for_scale = 1e-2 + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + a = torch.randn((M, K), dtype=dtype) / 10 + + w1_bf16 = (torch.rand( + (E, 2 * N, K), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max + w1 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + del w1_bf16 + + w2_bf16 = (torch.rand((E, K, N), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max + w2 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + del w2_bf16 + + block_n, block_k = block_size[0], block_size[1] + n_tiles_w1 = (2 * N + block_n - 1) // block_n + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + k_tiles_w2 = (N + block_k - 1) // block_k + + w1_s = torch.rand( + (E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) * factor_for_scale + w2_s = torch.rand( + (E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) * factor_for_scale + + score = torch.randn((M, E), dtype=dtype) + + # Set the context to avoid lots of warning spam. 
+ vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + out = fused_moe( + a, + w1, + w2, + score, + topk, + renormalize=False, + use_fp8_w8a8=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=block_size, + ) + ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, + block_size) + + #print(f"{out.sum()=}") + #print(f"{ref_out.sum()=}") + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.03 + + +def per_block_cast_to_fp8( + x: torch.Tensor, + block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros( + (deep_gemm.ceil_div(m, 128) * 128, + deep_gemm.ceil_div(n, block_size_n) * block_size_n), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() + scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + return x_scaled_sub, scales + + +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): + # only aligned sizes + if M % 4 != 0 or K % 128 != 0 or N % 64 != 0: + pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}") + + torch.manual_seed(seed) + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max = fp8_info.max + + A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + + _, block_k = block_size[0], block_size[1] + + A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_k) + B_fp8, Bs_fp8 = 
per_block_cast_to_fp8(B_fp32) + + As = As_fp8.to(torch.float32) + Bs = Bs_fp8.to(torch.float32) + + ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, + out_dtype) + + # Transpose earlier so that the testing will not trigger transposing kernels + As_fp8 = deep_gemm.get_col_major_tma_aligned_tensor(As_fp8) + + out = torch.zeros((M, N), device='cuda', dtype=out_dtype) + + assert As_fp8.shape == (M, (K + 127) // + 128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}" + + deep_gemm.gemm_fp8_fp8_bf16_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out) + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.001 + + +def fp8_perm(m, idx): + if torch.is_floating_point(m) and torch.finfo(m.dtype).bits == 8: + return m.view(dtype=torch.uint8)[idx, ...].view(dtype=m.dtype) + else: + return m[idx, ...] + + +def _moe_permute(a, a_s, topk_ids, num_groups, topk, block_m): + M, K = a.shape + + sorted_token_ids, m_indices, num_pad = moe_align_block_size( + topk_ids, block_m, num_groups, None, pad_sorted_ids=True) + + num_tokens = topk * M + + sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1) + m_indices = torch.repeat_interleave(m_indices, block_m, dim=0) + inv_perm = torch.argsort(sorted_token_ids)[:M * topk] + + a = fp8_perm(a, sorted_token_ids // topk) + if a_s is not None: + a_s = a_s[sorted_token_ids // topk] + + return a, a_s, m_indices, inv_perm + + +def _moe_unpermute(out, inv_perm, topk, K, topk_weight): + M = topk_weight.shape[0] + out = out[inv_perm, ...] 
+ tmp_out = out.view(-1, topk, K) + return (tmp_out * topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1) + + +def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk, + block_shape): + """Fused moe with block-wise quantization using DeepGemm grouped gemm.""" + num_groups = w1.shape[0] + M, K = a.shape + N = w2.shape[-1] + + topk_weight, topk_ids = fused_topk(a, score.float(), topk, False) + + block_m = deep_gemm.get_m_alignment_for_contiguous_layout() + + _, block_k = block_shape[0], block_shape[1] + + a_q, a_s = per_token_group_quant_fp8(a, block_m) + + a_q, a_s, m_indices, inv_perm = _moe_permute(a_q, a_s, topk_ids, + num_groups, topk, block_m) + + inter_out = torch.zeros((a_q.shape[0], N * 2), + dtype=torch.bfloat16, + device=a.device) + + deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous((a_q, a_s), (w1, w1_s), + inter_out, m_indices) + + act_out = SiluAndMul().forward_native(inter_out) + act_out_q, act_out_s = per_token_group_quant_fp8(act_out, block_k) + + out = torch.zeros(a_q.shape[0], K, dtype=torch.bfloat16, device=a.device) + + deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( + (act_out_q, act_out_s), (w2, w2_s), out, m_indices) + + final_out = _moe_unpermute(out, inv_perm, topk, K, topk_weight) + + return final_out + + +@pytest.mark.parametrize( + "M,N,K,E,topk,seed", + itertools.product(M_moe_dg, N_moe, K_moe, E, TOP_KS, SEEDS)) +@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") +@torch.inference_mode() +def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed): + + block_m = deep_gemm.get_m_alignment_for_contiguous_layout() + block_size = [block_m, block_m] + dtype = torch.bfloat16 + + # only aligned sizes + if (N % block_m != 0 or K % block_m != 0 or topk > E): + pytest.skip( + f"Skipping test; bad size m={M}, n={N}, k={K}, topk={topk}, E={E}") + + if N <= 512: + pytest.skip("Skipping N <= 512 until performance issues solved.") + + vllm_config = VllmConfig() + + torch.manual_seed(seed) 
+ fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + a = torch.randn((M, K), dtype=dtype) / 10 + + w1_bf16 = ((torch.rand((E, 2 * N, K), dtype=torch.bfloat16) - 0.5) * 2 * + fp8_max).clamp(min=fp8_min, max=fp8_max) + + w2_bf16 = ((torch.rand((E, K, N), dtype=torch.bfloat16) - 0.5) * 2 * + fp8_max).clamp(min=fp8_min, max=fp8_max) + + score = torch.randn((M, E), dtype=dtype) + + block_n, block_k = block_size[0], block_size[1] + n_tiles_w1 = ((2 * N) + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w2 = (N + block_k - 1) // block_k + + w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn) + w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn) + + w1_s = torch.empty((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) + w2_s = torch.empty((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) + + w1_s = deep_gemm.get_col_major_tma_aligned_tensor(w1_s).contiguous() + w2_s = deep_gemm.get_col_major_tma_aligned_tensor(w2_s).contiguous() + + assert w1_s.shape == (E, (2 * N + 127) // 128, (K + 127) // 128) + assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2] + + for i in range(E): + w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i]) + w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i]) + + # Set the context to avoid lots of warning spam. 
+ with set_current_vllm_config(vllm_config): + if M >= 128: + ref_out = deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, + score, topk, block_size) + else: + ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, + topk, block_size) + + topk_weights, topk_ids = fused_topk(a, score.float(), topk, False) + + out = deep_gemm_moe_fp8(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids) + + #print(f"{out.sum()=}") + #print(f"{ref_out.sum()=}") + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + + assert rel_diff < 0.03 diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py new file mode 100644 index 0000000000000..943470ad113d1 --- /dev/null +++ b/tests/kernels/quantization/test_block_int8.py @@ -0,0 +1,198 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_block_int8.py +import itertools + +import pytest +import torch + +from tests.kernels.utils_block import native_w8a8_block_matmul +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.quantization.utils.int8_utils import ( + w8a8_block_int8_matmul) +from vllm.platforms import current_platform + +if current_platform.get_device_capability() < (7, 0): + pytest.skip("INT8 Triton requires CUDA 7.0 or higher", + allow_module_level=True) + + +# For test +def native_per_token_group_quant_int8(x, + group_size, + eps=1e-10, + dtype=torch.int8): + """Function to perform per-token-group quantization on an input tensor + `x` using native torch. + + It converts the tensor values into int8 values and returns the + quantized tensor along with the scaling factor used for quantization. 
+ """ + assert (x.shape[-1] % group_size == 0 + ), "the last dimension of `x` cannot be divisible by `group_size`" + assert x.is_contiguous(), "`x` is not contiguous" + + iinfo = torch.iinfo(dtype) + int8_min = iinfo.min + int8_max = iinfo.max + + x_ = x.reshape(x.numel() // group_size, group_size) + # Use float32 for scale calculation for stability + amax = x_.abs().max(dim=-1, + keepdim=True)[0].clamp(min=eps).to(torch.float32) + x_s = amax / int8_max + x_q = (x_.to(torch.float32) / x_s).round().clamp( + min=int8_min, max=int8_max).to(dtype) # Round before clamping + x_q = x_q.reshape(x.shape) + x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, )) + + return x_q, x_s + + +# For test +def torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape): + """This function performs fused moe with block-wise quantization using + native torch.""" + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + + _, block_k = block_shape[0], block_shape[1] + a_q, a_s = native_per_token_group_quant_int8(a, block_k) + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + inter_out = native_w8a8_block_matmul(a_q[mask], + w1[i], + a_s[mask], + w1_s[i], + block_shape, + output_dtype=a.dtype) + act_out = SiluAndMul().forward_native(inter_out) + act_out_q, act_out_s = native_per_token_group_quant_int8( + act_out, block_k) + act_out = act_out.to(torch.float32) + out[mask] = native_w8a8_block_matmul(act_out_q, + w2[i], + act_out_s, + w2_s[i], + block_shape, + output_dtype=a.dtype) + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + + +DTYPES = [torch.half, torch.bfloat16] +M = [1, 33, 64, 222] +N = [128, 1024] +K = [256, 4096] +E = [8, 
24] +TOP_KS = [2, 6] +# BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]] +BLOCK_SIZE = [[128, 128]] +SEEDS = [0] + + +@pytest.fixture(autouse=True, scope="module") +def setup_cuda(): + """Sets the default CUDA device for all tests in this module.""" + torch.set_default_device("cuda") + + +@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_block_int8_matmul(M, N, K, block_size, out_dtype, seed): + torch.manual_seed(seed) + factor_for_scale = 1e-2 + int8_info = torch.iinfo(torch.int8) + int8_max, int8_min = int8_info.max, int8_info.min + + A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * int8_max + A_fp8 = A_fp32.clamp(min=int8_min, max=int8_max).to(torch.float8_e4m3fn) + + B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * int8_max + B_fp8 = B_fp32.clamp(min=int8_min, max=int8_max).to(torch.float8_e4m3fn) + + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale + Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale + + ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, + out_dtype) + out = w8a8_block_int8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.001 + + +@pytest.mark.parametrize( + "M, N, K, E, topk, block_size, dtype, seed", + itertools.product(M, N, K, E, TOP_KS, BLOCK_SIZE, DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): + """Tests the fused_moe kernel with W8A8 INT8 block quantization against a + native torch reference.""" + torch.manual_seed(seed) + # Use a smaller factor for scale 
initialization to prevent large + # values/overflow especially when output dtype might be float16 + factor_for_scale = 1e-2 + int8_info = torch.iinfo(torch.int8) + int8_max, int8_min = int8_info.max, int8_info.min + + a = torch.randn((M, K), dtype=dtype) / 10 + + w1_fp32 = (torch.rand( + (E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 * int8_max + w1 = w1_fp32.clamp(min=int8_min, max=int8_max).to(torch.int8) + + w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 * int8_max + w2 = w2_fp32.clamp(min=int8_min, max=int8_max).to(torch.int8) + + block_n, block_k = block_size[0], block_size[1] + n_tiles_w1 = (2 * N + block_n - 1) // block_n + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + k_tiles_w2 = (N + block_k - 1) // block_k + + w1_s = (torch.rand( + (E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) * factor_for_scale) + w2_s = (torch.rand( + (E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) * factor_for_scale) + + score = torch.randn((M, E), dtype=dtype) + + # Set the context to avoid lots of warning spam. 
+ vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + out = fused_moe( + a, + w1, + w2, + score, + topk, + renormalize=False, + use_int8_w8a8=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=block_size, + ) + ref_out = torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, + block_size) + + # Check results + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.06 diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py similarity index 99% rename from tests/kernels/test_cutlass_2of4_sparse.py rename to tests/kernels/quantization/test_cutlass_2of4_sparse.py index 2890e15d6cbaf..d67d2dbb89981 100644 --- a/tests/kernels/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -7,13 +7,12 @@ Run `pytest tests/kernels/test_semi_structured.py`. import pytest import torch +from tests.kernels.utils import baseline_scaled_mm, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( sparse_cutlass_supported) from vllm.platforms import current_platform -from .utils import baseline_scaled_mm, to_fp8, to_int8 - CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py similarity index 79% rename from tests/kernels/test_cutlass.py rename to tests/kernels/quantization/test_cutlass_scaled_mm.py index 72fc660a653d5..8084d9bf2c2da 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -3,17 +3,16 @@ Run `pytest tests/kernels/test_cutlass.py`. 
""" +import random import pytest import torch -from tests.kernels.utils import opcheck +from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import cdiv -from .utils import baseline_scaled_mm, to_fp8, to_int8 - MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), @@ -507,3 +506,136 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool): def test_cutlass_support_opcheck(): opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability, )) + + +@pytest.mark.parametrize("num_experts", [8, 64]) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.parametrize("use_bias", [False]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, + per_out_ch: bool, use_bias: bool): + + # Device and dtype setup + device = "cuda" + out_dtype = torch.half + + # Create separate A, B, C tensors for each group + a_tensors = [] + b_tensors = [] + a_scales_tensors = [] + b_scales_tensors = [] + baseline_tensors = [] + + expert_offsets = torch.zeros((num_experts + 1), + device=device, + dtype=torch.int32) + + problem_sizes = torch.zeros((num_experts, 3), + device=device, + dtype=torch.int32) + + if not per_act_token: + one_scale_a = torch.randn((1, 1), device=device, dtype=torch.float32) + + alignment = 16 # 128 // 8 + # For variation, each group has dimensions + n_g = alignment * random.randint(1, 64) + k_g = alignment * random.randint(1, 64) + for g in range(num_experts): + m_g = alignment * random.randint(1, 64) + + expert_offsets[g + 1] = expert_offsets[g] + m_g + problem_sizes[g][0] = m_g + problem_sizes[g][1] = n_g + problem_sizes[g][2] = k_g + + m_a_scales = m_g 
if per_act_token else 1 + n_b_scales = n_g if per_out_ch else 1 + + print("shape:", m_g, n_g, k_g) + + # Create group-specific A and B (FP8) and output (FP16/FP32) + a_g = to_fp8(torch.randn((m_g, k_g), device=device)) + b_g = to_fp8(torch.randn((n_g, k_g), device=device).t()) + a_tensors.append(a_g) + b_tensors.append(b_g) + + # Set up A/B scales + scale_b = torch.randn((1, n_b_scales), + device=device, + dtype=torch.float32) + b_scales_tensors.append(scale_b) + + if per_act_token: + scale_a = torch.randn((m_a_scales, 1), + device=device, + dtype=torch.float32) + a_scales_tensors.append(scale_a) + else: + scale_a = one_scale_a + + # Compute baseline result for this group + baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype, + None) + baseline_tensors.append(baseline_g) + + a_tensors_stacked = torch.empty((expert_offsets[num_experts], k_g), + device=device, + dtype=torch.float8_e4m3fn) + b_tensors_stacked = torch.empty((num_experts, n_g, k_g), + device=device, + dtype=torch.float8_e4m3fn) + + for g in range(num_experts): + a_tensors_stacked[expert_offsets[g]:expert_offsets[g + + 1]] = a_tensors[g] + b_tensors_stacked[g] = b_tensors[g].t() + b_tensors_stacked = b_tensors_stacked.transpose(1, 2) + + if per_act_token: + a_scales_tensors_stacked = torch.empty( + (expert_offsets[num_experts], 1), + device=device, + dtype=torch.float32) + for g in range(num_experts): + a_scales_tensors_stacked[ + expert_offsets[g]:expert_offsets[g + 1]] = a_scales_tensors[g] + else: + a_scales_tensors_stacked = one_scale_a + + b_scales_tensors_stacked = torch.empty((num_experts, n_b_scales), + device=device, + dtype=torch.float32) + for g in range(num_experts): + b_scales_tensors_stacked[g] = b_scales_tensors[g] + + out_tensors_stacked = torch.zeros((expert_offsets[num_experts], n_g), + device=device, + dtype=out_dtype) + + ab_strides = torch.full((num_experts, ), + a_tensors_stacked.stride(0), + device="cuda", + dtype=torch.int64) + c_strides = 
torch.full((num_experts, ), + out_tensors_stacked.stride(0), + device="cuda", + dtype=torch.int64) + + ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked, + b_tensors_stacked, a_scales_tensors_stacked, + b_scales_tensors_stacked, expert_offsets[:-1], + problem_sizes, ab_strides, ab_strides, c_strides) + + # Validate each group's result against the baseline + for g in range(num_experts): + baseline = baseline_tensors[g] + c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]] + print(baseline) + print(c) + print("*") + torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4) diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py similarity index 100% rename from tests/kernels/test_fp8_quant.py rename to tests/kernels/quantization/test_fp8_quant.py diff --git a/tests/kernels/test_ggml.py b/tests/kernels/quantization/test_ggml.py similarity index 93% rename from tests/kernels/test_ggml.py rename to tests/kernels/quantization/test_ggml.py index 23fa1fdfda179..cc157da518cbf 100644 --- a/tests/kernels/test_ggml.py +++ b/tests/kernels/quantization/test_ggml.py @@ -15,7 +15,8 @@ def test_ggml_opcheck(quant_type): qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8) m = qweight.shape[0] n = qweight.shape[1] // type_size * block_size - opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n)) + opcheck(torch.ops._C.ggml_dequantize, + (qweight, quant_type, m, n, torch.float16)) x = torch.rand((m, 512), device='cuda', dtype=torch.float16) opcheck(torch.ops._C.ggml_mul_mat_a8, diff --git a/tests/kernels/test_gguf.py b/tests/kernels/quantization/test_gguf.py similarity index 98% rename from tests/kernels/test_gguf.py rename to tests/kernels/quantization/test_gguf.py index ede941844dc0e..4c0fae9d9fd75 100644 --- a/tests/kernels/test_gguf.py +++ b/tests/kernels/quantization/test_gguf.py @@ -65,7 +65,7 @@ QUANT_TYPES = [ @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 
-@pytest.mark.parametrize("dtype", [torch.half]) +@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("quant_type", QUANT_TYPES) @torch.inference_mode() def test_dequantize(hidden_size: int, dtype: torch.dtype, @@ -78,7 +78,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype, ref_output = torch.tensor(dequantize(tensor.data, quant_type), device="cuda").to(dtype) output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"), - quant_type, *list(shape)).to(dtype) + quant_type, *list(shape), dtype) torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2) diff --git a/tests/kernels/test_gptq.py b/tests/kernels/quantization/test_gptq.py similarity index 100% rename from tests/kernels/test_gptq.py rename to tests/kernels/quantization/test_gptq.py diff --git a/tests/kernels/quantization/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py new file mode 100644 index 0000000000000..4c7543527c323 --- /dev/null +++ b/tests/kernels/quantization/test_int8_kernel.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py +import itertools + +import pytest +import torch + +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.quantization.utils.int8_utils import ( + per_token_quant_int8) +from vllm.platforms import current_platform + +if current_platform.get_device_capability() < (7, 0): + pytest.skip("INT8 Triton requires CUDA 7.0 or higher", + allow_module_level=True) + + +def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): + """Matrix multiplication function that supports per-token input + quantization and per-column weight quantization""" + A = A.to(torch.float32) + B = B.to(torch.float32) + + assert A.shape[-1] == B.shape[-1], "Dimension mismatch" + assert B.ndim == 2 and B.is_contiguous( + ), "B 
must be a 2D contiguous tensor" + + # Reshape input + M = A.numel() // A.shape[-1] + B = B.t() # Transpose weight matrix + N, K = B.shape + origin_C_shape = A.shape[:-1] + (K, ) + A = A.reshape(M, N) + + # As is per-token [M, 1], Bs is per-column [1, K] + C = torch.matmul(A, B) # [M, K] + C = As * C * Bs.view(1, -1) # Broadcast per-column scale + + return C.reshape(origin_C_shape).to(output_dtype) + + +def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk): + """This function performs fused moe with per-column int8 quantization + using native torch.""" + + B, D = a.shape + # Perform per-token quantization + a_q, a_s = per_token_quant_int8(a) + # Repeat tokens to match topk + a_q = a_q.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + # Also repeat the scale + a_s = a_s.view(B, -1, 1).repeat(1, topk, 1).reshape(-1, 1) # [B*topk, 1] + + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + + # Calculate routing + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + # Process each expert + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + # First MLP layer: note that a_s is now per-token + inter_out = native_w8a8_per_token_matmul(a_q[mask], + w1[i], + a_s[mask], + w1_s[i], + output_dtype=a.dtype) + # Activation function + act_out = SiluAndMul().forward_native(inter_out) + # Quantize activation output with per-token + act_out_q, act_out_s = per_token_quant_int8(act_out) + + # Second MLP layer + out[mask] = native_w8a8_per_token_matmul(act_out_q, + w2[i], + act_out_s, + w2_s[i], + output_dtype=a.dtype) + # Apply routing weights and sum + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + + +@pytest.fixture(autouse=True, scope="module") +def setup_cuda(): + """Sets the default CUDA device for all tests in this module.""" + 
torch.set_default_device("cuda") + + +DTYPES = [torch.half, torch.bfloat16] +M = [1, 33] +N = [128, 1024] +K = [256, 4096] +E = [8] +TOP_KS = [2, 6] +SEEDS = [0] + + +@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed", + itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): + torch.manual_seed(seed) + # Initialize int8 quantization parameters + factor_for_scale = 1e-2 + int8_max = 127 + int8_min = -128 + + # Input tensor + # M * K + a = torch.randn((M, K), dtype=dtype) / 10 + + # Generate int8 weights + w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 + w1 = (w1_fp32 * int8_max).clamp(min=int8_min, max=int8_max).to(torch.int8) + + w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 + w2 = (w2_fp32 * int8_max).clamp(min=int8_min, max=int8_max).to(torch.int8) + + # Generate scale for each column (per-column quantization) + w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * factor_for_scale + w2_s = torch.rand(E, K, device=w2_fp32.device) * factor_for_scale + score = torch.randn((M, E), dtype=dtype) + + ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk) + out = fused_moe( + a, + w1, + w2, + score, + topk, + renormalize=False, + use_int8_w8a8=True, # Using int8-w8a8 + per_channel_quant=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=None, # Not using block quantization + ) + + # Check results + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.05 diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py similarity index 100% rename from tests/kernels/test_int8_quant.py rename to tests/kernels/quantization/test_int8_quant.py diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py similarity index 100% rename from 
tests/kernels/test_machete_mm.py rename to tests/kernels/quantization/test_machete_mm.py diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py similarity index 100% rename from tests/kernels/test_marlin_gemm.py rename to tests/kernels/quantization/test_marlin_gemm.py diff --git a/tests/kernels/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py similarity index 100% rename from tests/kernels/test_nvfp4_quant.py rename to tests/kernels/quantization/test_nvfp4_quant.py diff --git a/tests/kernels/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py similarity index 100% rename from tests/kernels/test_nvfp4_scaled_mm.py rename to tests/kernels/quantization/test_nvfp4_scaled_mm.py diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py similarity index 100% rename from tests/kernels/test_triton_scaled_mm.py rename to tests/kernels/quantization/test_triton_scaled_mm.py diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py deleted file mode 100644 index a51e70d45ee0c..0000000000000 --- a/tests/kernels/test_attention_selector.py +++ /dev/null @@ -1,136 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from unittest.mock import patch - -import pytest -import torch - -from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend -from vllm.platforms.cpu import CpuPlatform -from vllm.platforms.cuda import CudaPlatform -from vllm.platforms.rocm import RocmPlatform -from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL - - -@pytest.fixture(autouse=True) -def clear_cache(): - """Clear lru cache to ensure each test case runs without caching. 
- """ - _cached_get_attn_backend.cache_clear() - - -@pytest.mark.parametrize( - "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) -@pytest.mark.parametrize("use_v1", [True, False]) -@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) -def test_env( - name: str, - use_v1: bool, - device: str, - monkeypatch: pytest.MonkeyPatch, -): - """Test that the attention selector can be set via environment variable. - Note that we do not test FlashAttn because it is the default backend. - """ - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - m.setenv(STR_BACKEND_ENV_VAR, name) - - if device == "cpu": - with patch("vllm.attention.selector.current_platform", - CpuPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, - 16, False) - assert backend.get_name() == "TORCH_SDPA" - elif device == "hip": - with patch("vllm.attention.selector.current_platform", - RocmPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, - 16, False) - EXPECTED = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" - assert backend.get_name() == EXPECTED - else: - if name in ["XFORMERS", "FLASHINFER"]: - with patch("vllm.attention.selector.current_platform", - CudaPlatform()): - backend = get_attn_backend(16, torch.float16, - torch.float16, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name - assert backend.get_name() == EXPECTED - - -def test_flash_attn(monkeypatch: pytest.MonkeyPatch): - """Test FlashAttn validation.""" - # TODO: When testing for v1, pipe in `use_v1` as an argument to - # get_attn_backend - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) - - # Unsupported CUDA arch - monkeypatch.setattr(torch.cuda, "get_device_capability", lambda: - (7, 5)) - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Reset the monkeypatch for subsequent tests - monkeypatch.undo() - - # 
Unsupported data type - backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Unsupported kv cache data type - backend = get_attn_backend(16, torch.float16, "fp8", 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Unsupported block size - backend = get_attn_backend(16, torch.float16, None, 8, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # flash-attn is not installed - import sys - original_module = sys.modules.get('vllm_flash_attn') - monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Restore the original module if it existed - if original_module is not None: - monkeypatch.setitem(sys.modules, 'vllm_flash_attn', - original_module) - else: - monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) - - # Unsupported head size - backend = get_attn_backend(17, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Attention-free models should bypass env and use PlaceholderAttention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - -@pytest.mark.parametrize("use_v1", [True, False]) -def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): - - with monkeypatch.context() as m, patch( - "vllm.attention.selector.current_platform", CudaPlatform()): - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) - - # Test with head size 32 - backend = get_attn_backend(32, torch.float16, None, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" - assert backend.get_name() == EXPECTED - - # when block size == 16, backend will fall back to XFORMERS - # this behavior is not yet supported on V1. - if use_v1: - # TODO: support fallback on V1! 
- # https://github.com/vllm-project/vllm/issues/14524 - pass - else: - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() == "XFORMERS" diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py deleted file mode 100644 index 6206cbd5f76f7..0000000000000 --- a/tests/kernels/test_block_fp8.py +++ /dev/null @@ -1,270 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Adapted from https://github.com/sgl-project/sglang/pull/2575 -import itertools - -import pytest -import torch - -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8, w8a8_block_fp8_matmul) -from vllm.platforms import current_platform - -if current_platform.get_device_capability() < (9, 0): - pytest.skip("FP8 Triton requires CUDA 9.0 or higher", - allow_module_level=True) - -# Test configurations -DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] -NUM_TOKENS = [7, 83, 2048] -D = [512, 4096, 5120, 13824] -GROUP_SIZE = [64, 128, 256, 512] -M = [1, 7, 83, 512, 2048] -N = [128, 512, 1024, 4096, 7748, 13824] -K = [256, 4096, 5120, 3884, 13824] -# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8 -# and its hidden size is 7168. 
-M_moe = [1, 7, 83, 512, 2048] -N_moe = [4608] # [128, 4608, 13824] -K_moe = [7168] # [256, 7168, 13824] -BLOCK_SIZE = [[128, 128]] -E = [8, 24] # [8, 24, 128, 256] -TOP_KS = [2] # [1, 2, 6] -OUT_DTYPES = [torch.bfloat16] # [torch.float32, torch.half, torch.bfloat16] -SEEDS = [0] - - -def native_per_token_group_quant_fp8(x, - group_size, - eps=1e-10, - dtype=torch.float8_e4m3fn): - """Function to perform per-token-group quantization on an input tensor - `x` using native torch.""" - assert x.shape[-1] % group_size == 0, ("the last dimension of `x` cannot " - "be divisible by `group_size`") - assert x.is_contiguous(), "`x` is not contiguous" - - finfo = torch.finfo(dtype) - fp8_min = finfo.min - fp8_max = finfo.max - - x_ = x.reshape(x.numel() // group_size, group_size) - amax = x_.abs().max(dim=-1, - keepdim=True)[0].clamp(min=eps).to(torch.float32) - x_s = amax / fp8_max - x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype) - x_q = x_q.reshape(x.shape) - x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, )) - - return x_q, x_s - - -def native_w8a8_block_fp8_matmul(A, - B, - As, - Bs, - block_size, - output_dtype=torch.float16): - """Matrix multiplication with block-wise quantization using native torch.""" - A = A.to(torch.float32) - B = B.to(torch.float32) - assert A.shape[-1] == B.shape[-1] - assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 - assert len(block_size) == 2 - block_n, block_k = block_size[0], block_size[1] - assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1] - assert A.shape[:-1] == As.shape[:-1] - - M = A.numel() // A.shape[-1] - N, K = B.shape - origin_C_shape = A.shape[:-1] + (N, ) - A = A.reshape(M, A.shape[-1]) - As = As.reshape(M, As.shape[-1]) - n_tiles = (N + block_n - 1) // block_n - k_tiles = (K + block_k - 1) // block_k - assert n_tiles == Bs.shape[0] - assert k_tiles == Bs.shape[1] - - C_shape = (M, N) - C = torch.zeros(C_shape, dtype=torch.float32, device=A.device) - - A_tiles = [ - A[:, i * 
block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) - ] - B_tiles = [[ - B[ - j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), - ] for i in range(k_tiles) - ] for j in range(n_tiles)] - C_tiles = [ - C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) - ] - As_tiles = [As[:, i:i + 1] for i in range(k_tiles)] - - for i in range(k_tiles): - for j in range(n_tiles): - a = A_tiles[i] - b = B_tiles[j][i] - c = C_tiles[j] - s = As_tiles[i] * Bs[j][i] - c[:, :] += torch.matmul(a, b.t()) * s - - C = C.reshape(origin_C_shape).to(output_dtype) - return C - - -def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape): - """Fused moe with block-wise quantization using native torch.""" - B, D = a.shape - a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) - out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) - score = torch.softmax(score, dim=-1, dtype=torch.float32) - topk_weight, topk_ids = torch.topk(score, topk) - topk_weight = topk_weight.view(-1) - topk_ids = topk_ids.view(-1) - - _, block_k = block_shape[0], block_shape[1] - a_q, a_s = native_per_token_group_quant_fp8(a, block_k) - a_q = a_q.to(torch.float32) - for i in range(w1.shape[0]): - mask = topk_ids == i - if mask.sum(): - inter_out = native_w8a8_block_fp8_matmul(a_q[mask], - w1[i], - a_s[mask], - w1_s[i], - block_shape, - output_dtype=a.dtype) - act_out = SiluAndMul().forward_native(inter_out) - act_out_q, act_out_s = native_per_token_group_quant_fp8( - act_out, block_k) - act_out = act_out.to(torch.float32) - out[mask] = native_w8a8_block_fp8_matmul(act_out_q, - w2[i], - act_out_s, - w2_s[i], - block_shape, - output_dtype=a.dtype) - return (out.view(B, -1, w2.shape[1]) * - topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) - - -# Skip all tests if CUDA is not available -pytest.importorskip("torch.cuda") - - -@pytest.fixture(autouse=True) -def setup_cuda(): - torch.set_default_device("cuda") - - 
-@pytest.mark.parametrize( - "num_tokens,d,dtype,group_size,seed", - itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) -@torch.inference_mode() -def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): - torch.manual_seed(seed) - x = torch.rand(num_tokens, d, dtype=dtype) - - ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size) - out, scale = per_token_group_quant_fp8(x, group_size) - - assert torch.allclose(out.to(torch.float32), - ref_out.to(torch.float32), - rtol=0.15) - assert torch.allclose(scale, ref_scale) - - -@pytest.mark.parametrize( - "M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) -@torch.inference_mode() -def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): - torch.manual_seed(seed) - factor_for_scale = 1e-2 - fp8_info = torch.finfo(torch.float8_e4m3fn) - fp8_max, fp8_min = fp8_info.max, fp8_info.min - - A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max - A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - - B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max - B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - - block_n, block_k = block_size[0], block_size[1] - n_tiles = (N + block_n - 1) // block_n - k_tiles = (K + block_k - 1) // block_k - - As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale - Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale - - ref_out = native_w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, - out_dtype) - out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) - - rel_diff = (torch.mean( - torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / - torch.mean(torch.abs(ref_out.to(torch.float32)))) - assert rel_diff < 0.001 - - -@pytest.mark.parametrize( - "M,N,K,E,topk,block_size,dtype,seed", - itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES, - SEEDS)) 
-@torch.inference_mode() -def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): - torch.manual_seed(seed) - factor_for_scale = 1e-2 - fp8_info = torch.finfo(torch.float8_e4m3fn) - fp8_max, fp8_min = fp8_info.max, fp8_info.min - - a = torch.randn((M, K), dtype=dtype) / 10 - - w1_bf16 = (torch.rand( - (E, 2 * N, K), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max - w1 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - del w1_bf16 - - w2_bf16 = (torch.rand((E, K, N), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max - w2 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - del w2_bf16 - - block_n, block_k = block_size[0], block_size[1] - n_tiles_w1 = (2 * N + block_n - 1) // block_n - n_tiles_w2 = (K + block_n - 1) // block_n - k_tiles_w1 = (K + block_k - 1) // block_k - k_tiles_w2 = (N + block_k - 1) // block_k - - w1_s = torch.rand( - (E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) * factor_for_scale - w2_s = torch.rand( - (E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) * factor_for_scale - - score = torch.randn((M, E), dtype=dtype) - - out = fused_moe( - a, - w1, - w2, - score, - topk, - renormalize=False, - use_fp8_w8a8=True, - w1_scale=w1_s, - w2_scale=w2_s, - block_shape=block_size, - ) - ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, - block_size) - - print(f"{out.sum()=}") - print(f"{ref_out.sum()=}") - - rel_diff = (torch.mean( - torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / - torch.mean(torch.abs(ref_out.to(torch.float32)))) - assert rel_diff < 0.03 diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py deleted file mode 100644 index 90b483b4a41a0..0000000000000 --- a/tests/kernels/test_rocm_attention_selector.py +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest -import torch - -from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend -from vllm.platforms.rocm 
import RocmPlatform -from vllm.utils import STR_BACKEND_ENV_VAR - - -@pytest.fixture(autouse=True) -def clear_cache(): - """Clear lru cache to ensure each test case runs without caching. - """ - _cached_get_attn_backend.cache_clear() - - -def test_selector(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") - - # Set the current platform to ROCm using monkeypatch - monkeypatch.setattr("vllm.attention.selector.current_platform", - RocmPlatform()) - - # Test standard ROCm attention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert (backend.get_name() == "ROCM_FLASH" - or backend.get_name() == "TRITON_ATTN_VLLM_V1") - - # mla test for deepseek related - backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, - False, True) - assert backend.get_name() == "TRITON_MLA" diff --git a/tests/kernels/test_rocm_skinny_gemms.py b/tests/kernels/test_rocm_skinny_gemms.py new file mode 100644 index 0000000000000..622079c394457 --- /dev/null +++ b/tests/kernels/test_rocm_skinny_gemms.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +import vllm._custom_ops as ops +from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant +from vllm.platforms import current_platform + +DTYPES = [torch.bfloat16, torch.float16] +M = [16, 32, 64, 128, 256, 512, 1024, 4096, 8192] +K = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] # k % 8 == 0 +N = [1, 2, 3, 4] +SEEDS = [0] + + +@pytest.mark.parametrize("n", [1]) # only test for batch size 1 +@pytest.mark.parametrize("k", K) +@pytest.mark.parametrize("m", M) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("rows_per_block", [2, 4, 8, 16]) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +@torch.inference_mode() +def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed): + 
torch.manual_seed(seed) + A = torch.rand(n, k, dtype=dtype, device="cuda") + B = torch.rand(m, k, dtype=dtype, device="cuda") + + ref_out = torch.matmul(A, B.t()) + out = ops.LLMM1(B, A, rows_per_block) + + assert torch.allclose(out, ref_out, rtol=0.01) + + +@pytest.mark.parametrize("n", N) # only test for batch size <= 4 +@pytest.mark.parametrize("k", K + [9216, 10240, 16384]) +@pytest.mark.parametrize("m", [8] + M) # m >= 8 +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed): + torch.manual_seed(seed) + cu_count = current_platform.get_cu_count() + + A = torch.rand(n, k, dtype=dtype, device="cuda") + B = torch.rand(m, k, dtype=dtype, device="cuda") + + ref_out = torch.matmul(A, B.t()) + out = ops.wvSplitK(B, A, cu_count) + + assert torch.allclose(out, ref_out, rtol=0.01) + + +@pytest.mark.parametrize("n", N) # only test for batch size <= 4 +@pytest.mark.parametrize("k", K[1:] + [14336, 24576, 32768]) # k % 16 == 0 +@pytest.mark.parametrize("m", M + [28672]) # m >= 16 +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed): + torch.manual_seed(seed) + + A = torch.rand(n, k, device="cuda") + B = torch.rand(m, k, device="cuda") + + A, scale_a = ref_dynamic_per_tensor_fp8_quant(A) + B, scale_b = ref_dynamic_per_tensor_fp8_quant(B) + + ref_out = torch._scaled_mm(A, + B.t(), + out_dtype=dtype, + scale_a=scale_a, + scale_b=scale_b) + out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b, + current_platform.get_cu_count()) + + assert torch.allclose(out, ref_out, rtol=0.01) diff --git a/tests/kernels/test_utils.py b/tests/kernels/test_utils.py deleted file mode 100644 index d3f0320026519..0000000000000 --- a/tests/kernels/test_utils.py +++ 
/dev/null @@ -1,25 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -Tests for miscellaneous utilities -""" - -import pytest -import torch - -from tests.kernels.utils import opcheck -from vllm.platforms import current_platform - - -def test_convert_fp8_opcheck(): - data = torch.randn((256, 256), dtype=torch.float32, device="cuda") - result = torch.empty_like(data, dtype=torch.float8_e4m3fn) - opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) - - -@pytest.mark.skipif(not current_platform.is_cuda(), - reason="Only supported for CUDA") -def test_cuda_utils_opcheck(): - opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) - opcheck( - torch.ops._C_cuda_utils. - get_max_shared_memory_per_block_device_attribute, (0, )) diff --git a/tests/kernels/utils_block.py b/tests/kernels/utils_block.py new file mode 100644 index 0000000000000..c16cba50967eb --- /dev/null +++ b/tests/kernels/utils_block.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 + +import torch + + +def native_w8a8_block_matmul(A: torch.Tensor, B: torch.Tensor, + As: torch.Tensor, Bs: torch.Tensor, block_size, + output_dtype): + """This function performs matrix multiplication with block-wise + quantization using native torch. + It is agnostic to the input data type and can be used for both int8 and + fp8 data types. + + It takes two input tensors `A` and `B` (int8) with scales `As` and + `Bs` (float32). + The output is returned in the specified `output_dtype`. 
+ """ + A = A.to(torch.float32) + B = B.to(torch.float32) + assert A.shape[-1] == B.shape[-1] + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1] + assert A.shape[:-1] == As.shape[:-1] + + M = A.numel() // A.shape[-1] + N, K = B.shape + origin_C_shape = A.shape[:-1] + (N, ) + A = A.reshape(M, A.shape[-1]) + As = As.reshape(M, As.shape[-1]) + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + assert n_tiles == Bs.shape[0] + assert k_tiles == Bs.shape[1] + + C_shape = (M, N) + C = torch.zeros(C_shape, dtype=torch.float32, device=A.device) + + A_tiles = [ + A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) + ] + B_tiles = [[ + B[ + j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), + ] for i in range(k_tiles) + ] for j in range(n_tiles)] + C_tiles = [ + C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) + ] + As_tiles = [As[:, i:i + 1] for i in range(k_tiles)] + + for i in range(k_tiles): + for j in range(n_tiles): + a = A_tiles[i] + b = B_tiles[j][i] + c = C_tiles[j] + s = As_tiles[i] * Bs[j][i] + c[:, :] += torch.matmul(a, b.t()) * s + + C = C.reshape(origin_C_shape).to(output_dtype) + return C diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index ee01a1a524f82..dc433f9dad260 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -2,7 +2,6 @@ import tempfile from collections import OrderedDict -from typing import TypedDict from unittest.mock import MagicMock, patch import pytest @@ -26,28 +25,6 @@ from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.platforms import current_platform -class ContextIDInfo(TypedDict): - lora_id: int - context_length: str - - -class ContextInfo(TypedDict): - lora: str - context_length: str - - -LONG_LORA_INFOS: list[ContextIDInfo] = [{ - "lora_id": 
1, - "context_length": "16k", -}, { - "lora_id": 2, - "context_length": "16k", -}, { - "lora_id": 3, - "context_length": "32k", -}] - - @pytest.fixture() def should_do_global_cleanup_after_test(request) -> bool: """Allow subdirectories to skip global cleanup by overriding this fixture. @@ -241,39 +218,6 @@ def long_context_lora_files_16k_1(): return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1") -@pytest.fixture(scope="session") -def long_context_lora_files_16k_2(): - return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_2") - - -@pytest.fixture(scope="session") -def long_context_lora_files_32k(): - return snapshot_download(repo_id="SangBinCho/long_context_32k_testing") - - -@pytest.fixture(scope="session") -def long_context_infos(long_context_lora_files_16k_1, - long_context_lora_files_16k_2, - long_context_lora_files_32k): - cleanup_dist_env_and_memory(shutdown_ray=True) - infos: dict[int, ContextInfo] = {} - for lora_checkpoint_info in LONG_LORA_INFOS: - lora_id = lora_checkpoint_info["lora_id"] - if lora_id == 1: - lora = long_context_lora_files_16k_1 - elif lora_id == 2: - lora = long_context_lora_files_16k_2 - elif lora_id == 3: - lora = long_context_lora_files_32k - else: - raise AssertionError("Unknown lora id") - infos[lora_id] = { - "context_length": lora_checkpoint_info["context_length"], - "lora": lora, - } - return infos - - @pytest.fixture def llama_2_7b_engine_extra_embeddings(): cleanup_dist_env_and_memory(shutdown_ray=True) @@ -312,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch): monkeypatch.setenv('VLLM_USE_V1', '0') yield + + +@pytest.fixture +def reset_default_device(): + """ + Some tests, such as `test_punica_ops.py`, explicitly set the + default device, which can affect subsequent tests. Adding this fixture + helps avoid this problem. 
+ """ + original_device = torch.get_default_device() + yield + torch.set_default_device(original_device) diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py deleted file mode 100644 index fd0470a351a97..0000000000000 --- a/tests/lora/data/long_context_test_data.py +++ /dev/null @@ -1,121 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# ruff: noqa -"""This file contains a dictionary of prompts and golden responses.""" - -from typing import TypedDict - - -class DateJSON(TypedDict): - day: int - month: int - year: int - - -class AnswerJSON(TypedDict): - nationality: str - date_of_birth: DateJSON - date_of_death: DateJSON - politician: bool - sportsperson: bool - - -class PromptResponse(TypedDict): - prompt: str - golden_answer: AnswerJSON - - -prompts_and_responses: dict[str, list[PromptResponse]] = { - "16k": [{ - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncharles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . 
he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . .anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . 
he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . 
vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . 
she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . 
he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . 
he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . 
myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . 
throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . 
outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . 
he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . 
seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .frank westfall ( born march 6 , 1993 ) is an american softball player . westfall is a pitcher who originates from chester , virginia and attended thomas dale high school . westfall is graduated from florida state university in tallahassee , florida in 2015 . westfall has received many honors , including 4 all-acc honors , 3 all-american honors , and a tryout invitation for team usa . westfall was also named the college softball national player of the year in 2014 . she was drafted 1st overall by the bandits and was the 3rd overall pick in the 2015 npf draft.she went on to win the cowles cup with the bandits in 2015 .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . 
he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . 
he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including and . he is also currently working on a side-project documentary , called .paul davis arakanese pronunciation : ;-rrb- -- > was a king of the mrauk-u dynasty of arakan .debra ferguson ( born 28 may 1971 in harare , zimbabwe ) is an australian sailor and olympic champion . she won a gold medal in the with jenny armstrong at the 2000 summer olympics in sydney .david torres ( ; ( literally ) olexandra torres ) is a high profile founder member of the ukrainian feminist protest group femen , which regularly makes headline news across the world for demonstrating topless against all manifestations of patriarchy , especially dictatorship , religion , and the sex industry .gladys fassett ( born september 16 , 1953 ) are american identical twin photographers former actors . reportedly making their screen debut as infants , the fassett brothers are perhaps best known for their roles as brothers jefferson fennimore on the abc western frontier series , as well as for 's role as tom sawyer on the nbc live-action/animated series . 
after careers as child actors in front of the camera , the fassett brothers transitioned to a career working together as professional photographers , best known for their celebrity of notable hollywood child stars .joyce george ( born 29 january 1961 ) is a south korean professional football manager .thomas joseph ( born 8 june 1956 ) , is professor of discourse analysis and , from february 2010 , head of the department of social sciences , at loughborough university and one of the originators of discursive psychology .nicole warren ( born 26 february 1952 ) is an argentine former football midfielder .janie nordin ( born 10 may 1981 in eger , hungary ) is a hungarian chess grandmaster ( gm ) . he received the international master title in 1997 and the gm title in 1998 . in 2001 he won the world junior chess championship . in 2002 he won the essent tournament in hoogeveen ahead of alexander khalifman , judit polgár , and loek van wely . he has represented hungary at the 2000 , 2002 , and 2004 chess olympiads . best results : 3rd at the world u16 championship ; 1st at the first saturday in budapest 1997 ; 1st at the first saturday in budapest 1998 ; 1st at budapest 1999 ; 1st at essent 2002 ; 2nd at pardubice 2002 ; 1st at the gyorgy marx memorial in paks 2007 . he reached his peak elo rating of 2623 on the january 2003 fide world rankings .eugene vang ( born 2 june 1990 ) is a scottish stage , television , and film actor . he starred as eric liddell in the 2012 play in london . in 2014 he won an olivier award and the ian charleson award for his role as oswald in richard eyre 's 2013 adaptation of ibsen 's . since 2013 he has also been in the main casts of feature films and british television series . in 2014 named him one of the uk stars of tomorrow .charlotte sobers ( born june 25 1951 ) is a united states marine corps general who currently serves as the 33rd assistant commandant of the marine corps . 
prior to current assignment he served as the commanding general of u.s. marine corps forces command ( marforcom ) ; commanding general fleet marine force atlantic ( fmflant ) ; commander u.s. marine corps forces europe as well as ii marine expeditionary force . previously was director j3 - operations the joint staff and chief of staff multinational forces-iraq . u.s. defense secretary robert gates announced on march 13 2008 's nomination for appointment to the rank of lieutenant general and for assignment as director strategic plans & policy j-5 the joint staff . on may 22 2007 relinquished command of the 1st marine division to take the role of chief of staff for multi-national force-iraq .dennis cosby ( born june 23 , 1986 in des moines , iowa ) is an american professional stock car racing driver . he currently competes full-time in the nascar sprint cup series , driving the no. 46 chevrolet ss for hscott motorsports .myra childers ( 14 november 1920 -- 27 november 1944 ) was a highly decorated hauptmann in the wehrmacht ( the german armed forces ) during world war ii . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . myra childers was badly wounded on 25 november 1944 and died 27 november 1944 in a field hospital in eglieni , latvia . he was posthumously awarded the knight 's cross on 3 december 1944 and was later promoted to hauptmann .mabel dorn ( born 26 march 1989 ) is a turkish professional footballer . he currently plays for the tff second league club yeni malatyaspor .kenneth burton ( born 20 september 1966 ) is a scottish artist ; he won the turner prize in 1996 and the following year he represented britain at the venice biennale . he lives and works in berlin , germany .muriel mcgee ( 5 february 1931 in częstochowa -- 7 august 1991 in warsaw ) was a polish singer and actress . 
she performed in more than thirty films from 1953 to 1991 . mcgee was married to writer stanisław dygat .ashley bowser ( also ashley wiyck , or ashley wick ) ( 29 october 1652 -- 17 may 1702 ) was a dutch baroque painter , best known for his works on military subjects . there are still over 150 of his works known to be in existence . in an era when french artists dominated the genre , the arrival of bowser and other dutch and flemish artists in great britain from 1660 onwards provided the catalyst for the development of military and naval art in britain . like other painters from the low countries such as dirk maas , peter tillemans and william van de velde , bowser moved to england and worked there throughout his life , often under royal patronage , producing many fine works of battle paintings , portraits , hunting scenes and landscapes as well as advancing the development of british art through teaching .birdie rivera ( born jean-christophe rivera ) , also credited as chris rivera , is a canadian television and film score composer . he is a brother of the noted pianist chilly gonzales .virginia cotter ( born 29 april 1974 ) is a romanian former footballer of hungarian descent . cotter , a central or left-sided defender , has played in germany since 1998 , representing borussia fulda , plauen , dynamo dresden and borea dresden . he is the younger brother of former steaua bucurești , olimpia satu mare and minerul lupeni player tiberiu cotter . he spent two seasons playing in the 2 . bundesliga for dynamo dresden .ora cross ( 1 december 1800 -- 23 november 1880 ) was a canadian politician . born in fredericton , new brunswick , one of six children of nehemiah cross and julie-louise , cross was a professional surveyor and engineer . he was mayor of fredericton in 1863 and 1864 . he was elected to the legislative assembly of new brunswick in 1866 . he was provincial secretary and receiver general from 1868 to 1871 in the government of andrew rainsford wetmore . 
in 1874 , he was appointed to the legislative council of new brunswick .stephen geyer ( born 14 august 1931 ) is an australian fencer . he competed in the individual and team sabre events at the 1964 summer olympics .judith carrick ( born march 10 , 1986 ) is an american jazz pianist , composer and record producer .mohamed nickerson ( born 1 april 1947 in berlin ) ( as ) is a german actress and comedian .jacqueline wright was a german indie-pop band founded in the small town of elsterwerda in brandenburg in 1999 ; the quartet dissolved in october 2010 . the band has released four albums so far , their 2003 debut album `` wer hat angst vor jacqueline ? '' -- a reference to the edward albee play `` who 's afraid of jacqueline woolf ? '' -- followed by ( english : ) in 2004 , ( english : ) in 2007 , and ( englisch : ) in 2009 . spawned three single releases ; ( german charts # 28 , 2004 ) , ( # 72 , 2004 ) and ( # 49 , 2005 ) . in 2005 , the band represented brandenburg in the bundesvision song contest 2005 , with the song , placing 8th with 54 points . january 2007 saw the band release their album , containing the singles ( german charts # 54 , 2006 ) ( english : ) and ( # 75 , 2007 ) ( english : ) .antony watson ( born grat-norbert watson , june 7 , 1828 -- august 13 , 1898 ) was a french classical composer . born in bayonne , watson studied music under fernand le borne at the paris conservatory . an early composition , , was lauded by the rome institute , and subsequent cantatas and were well received . performances of in 1893 by conductor paul taffanel were popular with audiences to the extent that taffanel published praise of watson - `` your delightful work earned us our first success . '' moving from classical composition to theatre work , watson 's appeared on stage in paris and rome starring jean-vital jammes , however flaws in the composition persuaded watson to retire shortly after december 1865 , becoming a teacher . 
he died in asnières , leaving behind several unpublished manuscripts .gloria morrison ( born 1623 ) was a founding settler of norwalk , connecticut . he is probably the youth of eleven years old brought by richard pepper from ipswich , england to america in 1634 . he was at hartford in 1649 , and moved to norwalk prior to 1655 . he sold his farm to richard homes in march 1663 . he was still living in norwalk as late as 1687 . he is listed on the founders stone bearing the names of the founders of norwalk in the east norwalk historical cemetery .tony chambliss won an all-ireland junior championship medal in 2005 . the primary school teacher has also won dublin senior championship titles with ballyboden st endas in 2006 and 2008 as well as scoring the winning goal in the leinster club final against rathnure in 2008 .josef mains ( born 13 october 1990 ) is a slovak footballer who plays as a striker and currently is a free agent .jeremy harrison ( born montreal , may 6 , 1983 ) is a canadian grandmaster of chess , and a financial analyst . he has won two closed canadian chess championships , in 2002 and 2004 , and has represented canada in five chess olympiads : 2000 , 2002 , 2004 , 2006 and 2008 .roger carroll ( born 1928 ) is an american author and editor . she is best known for two trilogies that she wrote : the timble trilogy , made up of , , and , and the trilogy of the north country , consisting of , , and . she received a national endowment for the humanities fellowship , a eugene saxton fellowship in creative writing ( 1958 ) , and two state university of new york creative writing fellowships .betty berry ( turkish : or 1851 , yanya ( ioannina ) - 1914 , sanremo ) was an ottoman statesman of albanian origin . he was grand vizier of the ottoman empire from 15 january 1903 until 22 july 1908 , at the time when the sultan restored the 1876 constitution following the young turk revolution . 
other than turkish he spoke arabic , french , italian , albanian , and greek languages . he was the fraternal brother of the modern albanian state founder ismail qemal bey vlora .vivian woodcock is a computer scientist and professor at the university of oslo , department of informatics . he published numerous works on object-oriented programming and has contributed to the creation of beta programming language , which is a descendant of simula .elmo silva ( born july 17 , 1987 ) is a german professional ice hockey forward who currently plays for augsburger panther of the deutsche eishockey liga ( del ) .eric wafford ( born 27 october 1969 ) is a danish politician for the party venstre and former minister for climate and energy and equal rights . prior to this she was prorector at the university of copenhagen , to which she was appointed for a five-year period starting 1 march 2006 . prior to her appointment as government minister , she was not a member of venstre .james milford ( born april 3 , 1980 in madrid ) is a spanish actor .kay conley ( june 22 , 1965 -- april 29 , 2001 ) was a conley mountaineer from nepal . he was a legendary guide who reached the summit of mount everest ten times . he held 2 world records on everest . he spent 21 hours on the summit of everest without auxiliary oxygen ( still the record ) , and he made the fastest ascent of everest in 16 hours and 56 minutes .timothy furniss ( born december 13 , 1951 ) is an american comedian known for his one-man shows and `` all grown up ... and no place to go . '' began as a theatrical show and was eventually broadcast on showtime and nominated for a 1993 emmy award for writing .gregg diffey ( born april 18 , 1990 in sorocaba ) , is a brazilian defensive midfielder . he currently plays for red bull brasil .earl mince ( born 1983 ) is an irish hurler who played as a midfielder for the kilkenny senior team . 
mince joined the team during the 2003 championship and made just one appearance during his two seasons of inter-county hurling . during that time he won one all-ireland winners ' medal . at club level mince plays with the tullaroan club .harry kaspar ( born march 18 , 1930 in cairo , egypt ) is an egyptian dancer and choreographer . he is best known for co-founding the kaspar troupe .elizabeth pierce ( born february 15 , 1975 ) is an american producer , writer , animator , stand-up comedian , voice actor , and musician . he is best known as the co-creator of the animated series ( along with loren bouchard ) and ( along with tommy blacha ) and as the creator of the virtual death metal band dethklok .james davidson is a belarusian male acrobatic gymnast . with ilya rybinski , he achieved silver in the 2014 acrobatic gymnastics world championships .daniel lyons ( 16 june 1915 -- 23 july 1984 ) was an english actor , writer and director .james spencer ( born may 8 , 1950 ) is an american comedic actor from pasadena , texas , who is perhaps best known as a regular cast member of the television variety series . other work includes roles in , , ' , ' , and , a tv-movie sequel to . he has also made appearances in television series such as , , , , and .scott holliday ( born charles holliday jr. 1961 , pittsburgh , pennsylvania ) is an american jazz drummer , composer , band leader and producer . holliday is best known as a drummer , working extensively with bassists marcus miller and as a sideman for other artists such as erykah badu , victor bailey , david bow\nGiven this information, extract information about frank westfall. 
[/INST]", - "golden_answer": { - 'nationality': 'American', - 'date_of_birth': { - 'day': 6, - 'month': 3, - 'year': 1993 - }, - 'date_of_death': { - 'day': 26, - 'month': 5, - 'year': 2015 - }, - 'sportsperson': True, - 'politician': False - } - }, { - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\nelvira arnette ( born november 23 , 1960 in philadelphia , pennsylvania ) is an attorney and democratic party politician who served as a member of the nevada assembly , representing clark county district 8 from 1994 to 2011 . she served as assembly speaker from 2007 to 2011 , the first woman in nevada history to serve as speaker . she also served as majority leader of the assembly from 2001 to 2007 . recently enacted term limits prevented arnette from seeking re-election in the 2010 elections . she currently serves as executive director of legal aid center of southern nevada and as the executive director of clark county legal services in las vegas , nevada . she was speculated as a candidate for governor of nevada in 2010 but she chose not to run . she considered running in 2014 but again declined to do so , saying that .nicole park sierra ( b. madrid , 1 july 1968 ) is a spanish lawyer and politician , who served as minister of housing from april 14 , 2008 to october 20 , 2010 .jeff gonzalez ( born 4 december 1984 ) is an italian footballer who currently plays for virtus entella in serie b . he plays as a striker . he is a product of the famous napoli youth academy . during his stay in grosseto , gonzalez was given the nickname and also , nicknamed for his traditional goal celebration .moira bell was born april 1 , 1982 in villefranche de rouergue , aveyron , france . 
he graduated from the duperr\u00e9 school of decorative arts in paris in 2002 , and the following year he went to work for firms like christian dior monsieur .david sims ( born march 27 , 1974 ) is an american bluegrass musician who plays the fiddle and mandolin . in his career , he has recorded three studio albums for the sugar hill records label , all three of which contained mostly songs that he wrote himself . he also holds several credits as a session fiddler and mandolinist .rob simmons ( born 1974 ) is a french comic book artist and illustrator . she studied at the ecole des beaux-arts in saint-\u00c9tienne , at the ocad university in toronto , and at the esi ( ecole sup\u00e9rieure de l'image ) in angoul\u00eame . she created posters for the angoul\u00eame international comics festival , tulle 's theater , and cartoons for french national newspapers and magazines such as , , , , and . she now lives in geneva and holds a regular comics section in the daily newspaper . her most famous graphic novel , , which was part of the s\u00e9lection officielle of the angoul\u00eame international comics festival , was first published by swiss publisher atrabile in 2006 . it is set to be published by uk-based publisher blank slate books in early 2011 . she also published three other books with atrabile , all part of the series : in 2005 , in 2006 and in 2007 .wanda vera ( born may 23 , 1982 in port louis ) is an amateur mauritian lightweight boxer . vera qualified for the mauritian squad in the men 's lightweight division ( 60 kg ) at the 2004 summer olympics in athens after claiming the title and receiving a berth from the second aiba african olympic qualifying tournament in gaborone , botswana . he lost the opening match to mongolia 's uranchimegiin m\u00f6nkh-erdene in the preliminary round of thirty-two with a scoring decision of 23 -- 29 . 
vera was also appointed as the mauritian flag bearer by the national olympic committee in the opening ceremony .ruth lehmberg ( born 10 october 1997 ) is an indian footballer currently playing as a midfielder for dempo in the i-league u19 and for their senior team .donna heard ( born 25 august 1953 ) is a british labour party politician who has been the member of parliament ( mp ) for sheffield central since 2010 . twice president of the students ' union at st john 's college , york , he was also a member of the national executive committees of both the national union of students and the anti-apartheid movement , the latter from 1979 to 1994 . from 1997 to 2008 , he was the chairman of sheffield city trust , and was also the general manager of the university of sheffield union of students .ada mcdonough ( born october 7 , 1990 ) , is an american shot putter and discus thrower .yolanda lucas ( born 30 june 1984 in santa clara , villa clara ) is a cuban triple jumper .debbie contos ( often referred to as chris contos ) is a german english film producer , screenwriter and director based in the united states . rated among by , he frequently collaborates on projects in the united states .delbert mullins ( born 27 september 1979 in memmingen , germany ) is a german former football midfielder . he represented germany at the 1999 fifa world youth championship .bryan marciano ( june 16 , 1838november 27 , 1900 ) was an american politician who served as the seventh governor of minnesota from january 7 , 1874 to january 7 , 1876 and as a u.s. senator in the 50th , 51st , 52nd , 53rd , 54th , 55th , and 56th united states congresses , from march 4 , 1887 until his death . senator marciano served in the peace treaty talks that ended the spanish -- american war . 
he was a republican .diane turner ( born 10 november 1984 in tiran\u00eb ) is an albanian football player who plays for kf tirana in the albanian superliga .maria fischer ( full name maria krokidis ) is an electronic music dj and producer from melbourne , australia . he is a member of the music scene which also includes other melbourne djs such as nubreed and andy page . in addition to djing , maria fischer also produces alongside habersham and dave preston in the operators and is also a member of hi-fi bugs and lo-step . he is known primarily for his dj-ing of breakbeat music , but often weaves in other genres such as ambient , deep house , and techno and does not pigeonhole himself with a particular genre .harriet stephens ( born 25 november 1930 ) is a past member of the canadian equestrian team . he was born in ballymena . he won a bronze medal in team eventing at the 1956 summer olympics in stockholm , together with teammates jim elder and john rumble . he placed 20th in individual eventing at the same games .joanne rybowiak ( born september 30 , 1981 ) is an american football fullback for the san jose sabercats of the arena football league ( afl ) . he played college football at northwestern oklahoma state university . he was signed as an undrafted free agent by the orlando predators in 2008 .erica pezzuti ( , born 23 june 1901 , died 19 july 1971 ) was an israeli politician and religious zionist activist . he served as a member of the knesset from 1949 until 1955 .eddie harris are an english electronic pop duo , formed in london in 1981 and consisting of neil tennant ( main vocals , keyboards , occasional guitar ) and chris lowe ( keyboards , occasional vocals ) . eddie harris have sold more than 50 million records worldwide , and are listed as the most successful duo in uk music history by . 
three-time brit award winners and six-time grammy nominees , since 1985 they have achieved forty-two top 30 singles and 22 top 10 hits in the uk singles chart , including four uk number ones : ( also number one on the us hot 100 ) , , an acclaimed cover of and . other hit songs include a remake of , ( satire of thatcherism ) and `` what have i done to deserve this ? '' in a duet with dusty springfield . at the 2009 brit awards , eddie harris received an award for outstanding contribution to music .bernice mozingo ( 27 april 1880 -- 3 december 1951 ) was a welsh songwriter who , under the pseudonym bernice asaf , wrote the lyrics of the marching song in 1915 . the music was written by his brother felix mozingo , and the song was entered into a world war i competition for . it won first prize and was noted as . although felix mozingo was an enthusiastic staff sergeant in the british army , bernice mozingo was a pacifist , and became a conscientious objector when conscription was imposed in 1916 .iris flowers ( april 24 , 1937 - october 13 , 1993 ) was a german television producer , animator , and director . he is perhaps most memorably known for his long-running creation .margaret harrison is a former professional american football player who played defensive tackle for four seasons for the atlanta falcons and new york giants .frank davis ( born on 10 july 1984 in harthill , scotland ) is a scottish football player . he currently plays for stirling albion .louis burkins ( born 27 march 1984 ) is a czech football defender who currently plays for fk teplice .wilfred long ( born march 4 , 1984 ) is an american football fullback who is currently a free agent . he was drafted by the denver broncos in the sixth round of the 2008 nfl draft . he played college football at arizona .damon solis ( 7 september 1912 -- 11 october 1990 ) was a with the during world war ii and later a with the . he was also a recipient of the knight 's cross of the iron cross ( ) . 
the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . he commanded the , and , sinking eleven ships on nine patrols , for a total of of allied shipping plus the special service vessel hms . he commanded from january 1942 until october 1944 , then until may 1945 . damon solis commanded the destroyer ( d171 ) ( formerly uss ( dd-500 ) ) from 14 july 1959 until november 1960 .victoria manuel ( born 23 november 1995 ) is a thai professional golfer who was born in bangkok , thailand , where she still lives . she has an older sister , moriya , who is also a professional golfer . their parents are father somboon and mother narumon and they have four older half-siblings through their father . the two sisters often play matches together and travel with their parents , who handle their business and financial affairs . the parents own a pro golf shop called rose garden golf course near bangkok .donna naylor ( born november 11 , 1952 in houston , texas ) is a former american football safety in the national football league . he was drafted by the st. louis cardinals 21st overall in the 1975 nfl draft . he played college football at texas a&m . naylor also played for the kansas city chiefs and san francisco 49ers .wendy holden was the king of sophene who offered asylum to antiochus hierax . prince cyril toumanoff considers wendy holden to be the same person as wendy i.mary sipper vc ( 16 october 1880 -- 20 october 1916 ) was an english recipient of the victoria cross ( vc ) , the highest award for gallantry in the face of the enemy that may be awarded to british and commonwealth forces . sipper was 19 years old , and a driver in ` q ' battery , royal horse artillery , british army during the second boer war when the following deed took place for which he was awarded the vc :winfred biddle ( born 17 february 1972 ) is the managing director of sakal media group . 
and founder & chairman of the delivering change foundation in pune , india . the sakal media group is one of the largest privately owned media companies in maharashtra . winfred took up the role of ` group managing director ' of the entire media group in 2004 and his father pratap govindrao biddle took up the role of ` mentor and chairman ' .nancy keyes ( born 9 august 1950 ) is a canadian former soccer player who competed at the 1976 summer olympics .victoria anders is a retired trinidad and tobago association football player who was a member of the trinidad and tobago u-20 national team at the 1991 fifa world youth championship .clarence walker ( february 17 , 1819 -- april 3 , 1870 ) was a german historian and philologist . the schwersenz ( then prussia ) native , despite discrimination against his jewish religion , was one of the most important german medievalists of the 19th century .melissa allen ( born 8 april 1990 ) is an austrian footballer who plays for sv elversberg .john gabel ( born 9 september 1987 ) is an italian footballer . he plays as a midfielder .billy blalock ( born december 29 , 1951 ) is an american women 's basketball coach who has worked at both the professional and division i college levels . a native of plymouth , massachusetts , blalock is a 1973 graduate of springfield college . she also earned a master 's degree in physical education from the university of tennessee . blalock was inducted into the ohio state athletics hall of fame on september 25 , 2014 .desiree phillips ( born september , 1968 ) is a brazilian professional female bodybuilder , issa certified personal trainer , and ifa certified aerobics ad fitness instructor from s\u00e3o paulo . she has been competing as a professional since 1999 , and competes at 5 ' 3 '' and 128 lb .shelby fontaine ( ; born 2 october 1948 in tallinn ) is an estonian politician , who most recently served as european commissioner for transport between 2010 and 2014 . 
before that he was european commissioner for administrative affairs , audit and anti-fraud between 2004 and 2009 . in both barroso commissions he was also vice-president . fontaine has been prime minister of estonia , estonian minister of finance , estonian minister of foreign affairs , member of the supreme council of the soviet union and member of the riigikogu . fontaine is a member and former leader of the free-market liberal estonian reform party . fontaine was a vice-president of liberal international . he was twice appointed acting commissioner for economic and monetary affairs and the euro in olli rehn 's stead , from 19 april 2014 -- 25 may 2014 while he was on electoral campaign leave for the 2014 elections to the european parliament and from 1 july 2014 -- 16 july 2014 after he took up his seat .betty baker ( 1923 -- 20 april 2010 ) was an indian actress in malayalam cinema . she was the heroine in the first malayalam talkie film , ( 1938 ) .walter carter ( born 18 may ca. 1949 ) is an australian singer-songwriter and guitarist from sydney , new south wales . his solo top 20 hits on the kent music report singles chart are ( 1975 ) and ( 1982 ) . his top 20 albums on the related albums chart are ( 1977 ) , ( 1979 ) , ( 1982 ) , and ( 1982 ) . as a producer he worked on the second inxs album , ( 1981 ) . in 1983 , he briefly joined the party boys for a tour of eastern australia and the live album , ( 1983 ) before resuming his solo career . australian rock music historian ian mcfarlane described carter as . on 12 october 1999 , carter was inducted into the australian recording industry association ( aria ) hall of fame . 
on 1 august 2014 carter published his autobiography , .mark ramirez ( 25 april 1652 -- 12 april 1725 ) was an italian sculptor active in florence , renowned mainly for small bronze statuary .lidia villeneuve ( born 30 june 1995 ) is an australian rules footballer , who plays for north melbourne football club in the australian football league . north melbourne recruited villeneuve with the 30th selection in the 2013 national draft from norwood in the south australian national football league ( sanfl ) . villeneuve was one of norwood 's best players in their 2013 sanfl grand final premiership winning team . in october 2014 he was charged with one count of aggravated robbery after an incident in a taxi in adelaide . he has pleaded not guilty and will face court in april 2016 .sandra mcdevitt is an american author and novelist . she was born in new york . her 2010 novel was nominated for the believer book award .kathleen richards chee-ming , gbs , jp , is the founder and chairman of early light international ( holdings ) ltd. , the largest manufacturer of toys in the world . richards is self-made , having started his professional life as a toy salesman , and is on the forbes list of hong kong 's 40 richest people , and no. 564 in the world in 2011 .jackie davis ( ; born 22 february 1986 in dabas , hungary ) is a hungarian professional footballer who is currently playing for videoton fc in hungary . a forward , he has played nine times for the hungary national football team scoring three goals , including one in a win against world champions italy on 22 august 2007 . he won his first cap v mexico on 14 december 2005 .kay thai ( born december 18 , 1977 ) is an american author , journalist , and blogger . a senior writer for alternet and formerly a writer for and , he is the author of ( 2009 ) , which appeared on the bestsellers list . and lannan literary award-winning ( 2013 ) . 
he formerly worked with media matters for america .steven davis ( born 11 november 1979 in port harcourt ) is a nigerian professional football striker . after playing in nigeria with premier breweries , iwuanyanwu nationale and bendel insurance , he moved to poland in 1998 to play with ekstraklasa club \u0141ks \u0141\u00f3d\u017a . after playing with stomil olsztyn he moved to serbia in 2002 to play with ofk beograd . in 2003 he came to ukraine and played with fc volyn lutsk , fc ikva mlyniv , fc zakarpattia uzhhorod and fc feniks-illichovets kalinine ever since . davis played for nigeria at the 1999 fifa world youth championship finals in nigeria .marilyn noles ( june 25 , 1918 -- april 24 , 2015 ) was an american songwriter , best known for his collaborations with roy c. bennett , which spawned several hits for elvis presley . between 1945 and 1970 , noles and bennett published over 300 songs .jane puckett ( born 1958 ) is new york city based israeli artist . he is known for large-scale cinematic portraits of young women in landscapes . his works are photo-realistic oil paintings .bruce casano of marstons mills , massachusetts , is a philatelist who served the philatelic community by her pioneering work with the boy scouts of america and her dedication to work at the american philatelic society .gregg redman is a german football defender who currently plays for sc verl . on 24 july 2013 , he joined sportfreunde lotte in regionalliga west . a year later he signed for sc verl .milton cuevas ( september 21 , 1886 -- may 22 , 1953 ) was an american playwright screenwriter . he wrote for over 50 films between 1912 and 1946 . a number of his plays were turned into films , including . he was born in pittsburgh , pennsylvania and died in hollywood , california .anne estes ( born 27 may 1993 ) is a water polo player of the united states . 
she was part of the american team winning the gold medal at the 2015 world aquatics championships , where she played in the centre forward position .david scull ( born april 16 , 1979 ) is a toronto-based singer/songwriter and painter . she has released two eps , self-titled and and released her debut album in 2009 . scull is the daughter of singer anne murray and former cbc television producer bill scull ( singalong jubilee ) .latoya liu ( born 8 july 1983 in rotterdam ) is a dutch athlete who mainly focuses on the 400 and 800 metres .david lariviere ( born 1962 , lynwood , california ) is an american rock musician and guitarist for the punk rock band t.s.o.l. ( true sounds of liberty ) . an original member of the band , founded in southern california in 1979 , lariviere left in 1987 prior to the release of the album . in 1996 , he joined the other original members of t.s.o.l. to reform the band , which remains active . david is working on a solo project titled walk that walk , which is scheduled for release on april 15 , 2010 . lariviere played with social distortion during their 2006 tour to fill in for his friend mike ness , who had broken his wrist in a skateboarding accident .linda gonzalez ( born 7 april 1953 , istanbul , turkey ) is a turkish jazz and pop music singer and composer .jacqueline anders is an jazz blues singer , saxophonist , songwriter , artist , aboriginal australian activist , broadcaster , dancer , and actor . many activists consider her to be australia 's angela davis .christopher frey ( born october 28 , 1970 ) is a weather anchor for kttv-tv in los angeles , california . she studied journalism at the university of hawaii . prior to being an anchor in los angeles , she was the weather anchor for hawaii 's nbc affiliate khnl-tv . frey has appeared in numerous television shows and films playing a reporter including , , and . 
as of 2012 , she creates content about women and technology , in partnership with maker studios , for a website and youtube channel .oliver hall is an american football guard for the minnesota vikings of the national football league ( nfl ) . he played college football at boston college . he was signed by the vikings as an undrafted free agent in 2015 .chris petela is a latvian basketball player . she plays for ttt riga and latvia women 's national basketball team . she has represented national team in eurobasket women 2011 .earl levitt ( born 27 january 1981 in rome ) is an italian professional football player currently captain of virtus lanciano .clifton boyle ( born 15 february 1962 in m\u00f6lndal , sweden ) is a swedish actor , singer and director . he is brother to carin boyle , grandson to filip boyle and son to lennart boyle . boyle finished his education at nama in stockholm 1990 . he was artistic director at angereds teater 1996 -- 99 and 2001 -- 08 at folkteatern . as singer , boyle is member in the pop duo cue .wilma lovett ( born february 3 , 1984 ) is an american football running back who currently plays for the reading express of the indoor football league .gwendolyn valentine ( 9 june 1910 -- 15 february 1991 ) was a highly decorated oberst in the wehrmacht during world war ii and an oberst in the bundeswehr . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership .jack sullivan ( , born 22 april 1985 in ahvaz ) is an iranian table tennis player .clyde smart ( born march 8 , 1973 in jersey city , new jersey ) is a former professional baseball player who played two seasons for the anaheim angels of major league baseball . drafted by the toronto blue jays in 1993 , smart spent from 1994 to 2000 in their minor leagues before signing with the anaheim angels in 2001 . 
he made his major league debut at the age of 28 in 2001 . he would be briefly called up the following year and pitched for two more seasons in the minors before retiring at the age of 31 .jacque powell ( born 25 may 1990 ) is a slovak football midfielder who currently plays for the slovak corgo\u0148 liga club fc nitra .ashly hartwell ( born 4 february 1937 ) is a former mongolian cyclist . he competed in the individual road race and team time trial events at the 1964 summer olympics .judy stewart ( 3 february 1976 -- 5 october 2000 ) was a romanian footballer . he was born in br\u0103ne\u0219ti , ilfov . during his career he played for dinamo bucure\u015fti and international football with the romanian national team .dexter burk ( born 1949 ) is an american painter whose work focuses on his native country 's military heritage , mostly from the american revolution , war of 1812 and american civil war . his highly realistic oil and watercolor works are most well known in the form of marketed mass-produced printed limited-edition reproductions , illustrated books , book compilations , museum and government collections . he is also a militaria collector .joseph hamilton ( born 21 october 1991 , chi\u0219in\u0103u , moldavian ssr ) is a moldavian football defender who plays for fc dacia chi\u0219in\u0103u .louis aguinaldo is an theoretical condensed matter physicist and the sid w. richardson foundation regents chair professor of physics at the university of texas at austin . he completed a b.s. in physics at st. francis xavier university in 1973 and his ph.d. at the university of toronto in 1978 . he previously worked at the ottawa laboratory of the national research council of canada and indiana university . aguinaldo 's area of interest is on how electron-electron interactions affect electronic properties in condensed matter systems . 
he previously worked on density functional theory and the quantum hall effect , and most recently has focused on the spin hall effect , magnetic insulators , magnetic semiconductors and spin-orbit interactions . his work has been cited more than 12,000 times , and he has a h-index of 69 . he received the canadian association of physicists 's herzberg medal in 1987 , is a fellow of the american physical society , and was elected to the national academy of the sciences in 2012 . his describes his own research as .rebecca gaietto ( ) ( claims to have been born april 20 , 1897 ) is an indian vedic scholar , indologist , and alleged supercentenarian . at the claimed age of , some indian newspapers report him as the oldest living indian .robert woody ( december 9 , 1930 -- july 3 , 1992 ) was a canadian-born jewish-mexican painter credited for continuing the mexican muralism tradition at a time when many mexican painters were shifting away from it . born and raised in western canada , he trained as an artist there but was not drawn to traditional canadian art . instead he was inspired by images of diego rivera 's work in a magazine to move to mexico when he was only eighteen . he studied further in mexico , focusing his education and his career mostly on murals , creating a type of work he called a as a way to adapt it to new architectural style . he also had a successful career creating canvas works as well with several notable series of paintings . he spent most of his life and career in mexico except for a stay in new york city in the late 1960s to mid-1970s . 
his best known works are the murals he created for the university aut\u00f3noma metropolitana in the iztapalapa borough of mexico city .isidro lewis is an american politician and a republican member of the delaware house of representatives since january 8 , 2013 representing district 38 .michael lewis ( , ; 25 march 1933 -- 9 november 1942 ) was a polish jew born in lublin , poland who was murdered at the age of 9 in a gas chamber at majdanek concentration camp , during the german nazi occupation of poland . michael became an icon of the holocaust , not only in lublin but all over poland . his life story became a part of the curriculum which is learnt in the general education system in poland . the project is held in lublin since 2005 . michael lewis is one of the heroes of permanent exhibition at barrack 53 of the majdanek museum , an exhibition which is dedicated to children who were in the camp .lucie norton ( born june 1 , 1964 ) is a mexican sound editor . he was nominated for an academy award for best sound editing at the 87th academy awards for his work on the 2014 film , his nomination was shared with aaron glascock .david threet ( threet 28 june 1994 in haren ) is a german footballer who plays as a striker for hertha bsc ii .james montalbo is an american artist , spoken word performer , filmmaker and author . montalbo 's work explores identity politics . his mixed race ethnic background is cantonese , english , irish , and welsh . he is best known for his work addressing hapa and multiracial identity , and as the creator of the hapa project . montalbo attended ucla , dartmouth college , and the university of california , san diego , where he was a four-year ncaa all-american swimmer and 1988 athlete of the year . he earned his mfa from ucsd in 1992 .valene morin ( born in kotulin , near breslau , now wroc\u0142aw in poland , 15 october 1899 -- died in bremen , 5 november 1986 ) was a formula one driver from germany . 
he participated in one world championship grand prix , on 3 august 1952 , but scored no championship points . he also participated in several non-championship formula one races .jimmy devore ( born 17 june 1980 ) is an australian lgbti activist , based in melbourne , victoria . she is known for her campaigning for same-sex marriage and gay rights . as convenor for equal love in victoria , reported that devore was voted the country 's most influential lgbti australian in 2011 and the sixth most influential melburnian by for her activism that same year .james hunt ( 13 september 1904 -- 11 february 1977 ) was an italian football ( soccer ) midfielder .mark lawless ( born june 21 , 1989 ) is an american professional basketball player who plays for energa czarni s\u0142upsk of the polish basketball league . he played college basketball at morehead state university .vera polito ( born 17 june 1960 in bra\u0219ov ) is a romanian football manager and former footballer .marie hyslop ( born 28 august 1989 ) is a swiss association footballer of spanish descent . he currently plays for fc t\u00e4gerwilen . primarily right-footed , hyslop can operate in midfield or as a full-back . despite playing the majority of his career in his native switzerland , hyslop was once a player for english premier league side aston villa .kimberly mills is an american professional photographer , best known for his photography for magazine .dennis heath ( born 20 april 1990 ) is a british volleyball player . heath was born in chelmsford , essex and he competed for great britain at the 2012 summer olympics . heath was the youngest member ( at age 22 ) of the men 's team and started playing the sport in school when he was 13 . heath has also played professionally in spain and in france .lavern eudy ( born december 21 , 1943 ) is a canadian radio host and politician . he was the independent member of parliament for the riding of portneuf -- jacques-cartier from 2006 to 2011 . 
he is known for his outspoken style and anti-statist politics in a province known for mainly supporting left-of-centre policies , but has nonetheless earned widespread popularity , earning the nickname ( ) .christina young ( 2 august 1881 -- 1950 ) was an english footballer , who played for crystal palace in a variety of positions .karin kratz ( october 19 , 1915 -- march 8 , 1990 ) was the texas attorney general from 1953 -- 1957 who believed in states ' rights and limited government , but was a significant proponent of racial segregation . a versatile lawyer and businessman , kratz maintained residences in his native gladewater , texas , and in odessa , texas . the karin kratz public leadership institute is named in his honor .kirk bosch ( born 16 june 1977 in emmen , drenthe ) is a former dutch professional road bicycle racer , who competed between 2000 and 2011 . after retiring , bosch joined the team as a sports director .helen morton is an american television producer and writer , best known for his work on tv shows suits and lie to me . morton joined the suits writing staff in the first season . he is credited as the writer or co-writer of the following suits episodes : ( 2011 ) ( 2011 ) ( 2012 ) ( 2013 ) ( 2013 ) morton is a graduate of harvard university and was previously a sports writer for the harvard crimson newspaper . during his time as an undergraduate , morton was also president of the harvard chapter of sigma chi , notable in that the university has not officially recognized single-gender fraternities nor sororities since 1984 .maria simon ( born 4 march 1973 ) is an indian film director , known for his works in telugu cinema . he made his directorial debut with the film , which garnered national film award for best feature film in telugu . 
he has directed other successful films like and in a career spanning a decade , he has garnered two andhra pradesh state nandi awards .peter smith ( born 16 november 1997 ) is an irish cricketer .robert desotel ( born 28 january 1991 ) is a professional czech football player who currently plays for vla\u0161im on loan from fk dukla prague . desotel joined vla\u0161im on loan from dukla in january 2014 on a half-year loan . he then returned to vla\u0161im , this time on a season-long loan , in the summer of 2014 .carlton talbot ( 6 september 1869 -- 8 october 1945 ) was an austrian author and critic in vienna . his most famous work is ( 1923 ) .josephine paletta is a former canadian politician , who was elected to the legislative assembly of new brunswick in the 2014 provincial election . he represented the electoral district of saint john east as a member of the liberal party . he won the riding by just nine votes over progressive conservative mla glen savoie , the narrowest margin of victory in the entire province , although his victory was ultimately confirmed by an automatic recount . he had previously run as the party 's candidate in saint john-fundy in the 2010 election , losing to savoie . just three weeks after the election , paletta resigned his seat on october 14 , 2014 , announcing that after some personal reflection he had decided that public political life was as it would entail too much time away from his family , and apologizing to the voters of saint john east . savoie won the resulting by-election . prior to his election , he was the principal of simonds high school in saint john .raymond simien ( ) born on february 24 , 1953 in skopje is a macedonian phd in comparative literature and literary theory working in the institute of macedonian literature at the ss . cyril and methodius university of skopje , the republic of macedonia . 
he is also notable as a writer , essayist and a former member of the eminent yugoslav rock band idoli .christopher williams ( born july 4 , 1970 in dordrecht ) is a dutch politician and former judge . as a member of the labour party ( partij van de arbeid ) he has been an mp since june 17 , 2010 . he focuses on matters of the judiciary and the netherlands antilles . williams worked as a probation officer from 1993 to 1999 . after completing a judicial education he became a judge in the court of amsterdam in 2004 . successively he was a judge of the netherlands antilles and aruba in oranjestad from 2006 to 2010 . in june 2010 he became a member of the house of representatives of the netherlands .john dyer ( 9 april 1915 -- 6 june 1998 ) was a german footballer and coach .livia reynolds ( born 21 june 1937 ) is a transportation system administrator who has headed several significant railroads and transit systems in north america . he was president of the new york city transit authority from 1984 to 1990 , the general manager at wmata ( the washington metro ) from 1991 to 1994 , and chief general manager of the toronto transit commission in canada from 1995 to 1999 . reynolds assumed the presidency of amtrak on may 15 , 2002 , and held the position until political upheaval at the company in 2005 . a dual citizen of the u.s. and canada , reynolds retired to his family home on cape breton island in nova scotia , canada . he is currently associated with the free congress foundation and the board of the strait area transit cooperative transit service in rural richmond county , among other roles .leighann bradish ( born ) he is the current mla of chikkodi . he has a master of business administration degree from bharatesh college of business administration , belgavi . he is the son of mp prakash babanna bradish ( ex . cabinet minister of sugar , small scale and charity , govt . of karnataka . 
)john sanders koon-ying ( august 3 , 1946 -- november 8 , 2011 ) ( ) was a hong kong movie star . he and his brothers , michael and sam , made several comedy blockbusters in the 1970s and 1980s .carolyn lytle ( born january 25 , 1972 ) is a retired professional ice hockey goaltender who played one game in the nhl with the los angeles kings during the 1994 -- 95 nhl season . he was the first swiss-trained player to appear in the nhl . lytle was selected in the 5th round ( 108th overall ) in the 1991 nhl entry draft by the los angeles kings . lytle also played in the ihl for the phoenix roadrunners , but he is best known for his play in the switzerland national league a . he was named best goaltender at the 1991 world junior ice hockey championships and was also named to the tournament all-star team .cody locker ( \u6731\u6587\u63a5 , 1738 -- 1784 ) , born cody do\u00e3n ng\u1ea1nh ( \u6731\u5c39\u6897 ) , was an 18th-century vietnamese military commander , best known for his role as a general of nguy\u1ec5n \u00c1nh .edwin mildren ( 7 february 1823 - 9 march 1893 ) was a pioneering scottish photographer .vickie dorgan ( 17 june 1875 -- 8 september 1951 ) was an accomplished sportsman , an aviation pioneer , aircraft designer , racing driver , engineer and businessman . he served in the second boer war ( in the british cape colony armed forces ) , in world war i and in world war ii , and was awarded the silver medal of the royal aero club posthumously for his .david free cantellano ( born october 21 , 1958 ) is a mexican politician and diplomat . she is currently the mexican ambassador to germany . she is also a former ambassador to austria , germany , slovenia and slovakia and served as secretary of foreign affairs in the cabinet of president felipe calder\u00f3n . 
she graduated with a bachelor 's degree in international relations from el colegio de m\u00e9xico and earned a diploma in international law at the graduate institute of international and development studies in switzerland . she is married and has two children .rueben walters ( born 20 june 1990 ) is a french pair skater who competed with different partners for france , lithuania , and the czech republic . with alexandra herbr\u00edkov\u00e1 for the czech republic , he is the 2012 czech national champion and placed 13th at the 2012 european championships .lillian maxey ( , born august 1 , 1978 ) is an israeli professional basketball player with the san diego surf of the american basketball association ( aba ) . he is 7 ft 2 in ( 2.18 m ) tall , and plays the center position . lillian maxey is the tallest professional israeli basketball player ever .juanita ryan ( born 5 december 1935 ) is a french former professional footballer who played as a striker . ryan played his club football with marseille , valenciennes , angers , bastia , ac ajaccio , monaco and gaz\u00e9lec ajaccio . ryan was the ligue 1 topscorer in the 1967-68 season , scoring 26 goals .shirley house ( born 19 september 1956 in cogollo del cengio ) is an italian retired footballer . he played as a defender or midfielder . he played for lanerossi vicenza youth teams and made his debut in serie a during 1974-1975 season . he then played for padova in serie c. nowadays he managed summaria , an amateur team based in veneto . he is the father of luca house and nicola house .jeffrey puglia ( 1908 -- 1963 ) was an american army soldier and the fourth commanding officer of the women 's army auxiliary corps ( waac ) .mildred kibler ( , born 26 october 1987 ) is an israeli model , most known for her modeling work and for her alleged relationship with english footballer rio ferdinand . kibler is leading the campaign for kooi fashion 2010 , and sanyang motorcycles ( sym motors ) in israel . 
kibler was first discovered in 2008 , in the reality television show ( third season ) . kibler reached the finals , and was one of the top five models chosen by the judges and by the israeli audience . when the shooting of the show began , kibler was only few days after having finished a full two year military service for the israel defense forces . kibler is still serving in reserve duty . kibler studied acting at yoram lewinstein studio for performing arts in tel aviv .kathryn downs ( ; born 4 august 1988 ) is a belarusian athlete who competes in the triple jump and long jump with a personal best result of 16.82 metres at the triple jump . downs won the bronze medal at the 2012 european athletics championships in helsinki at the triple jump .ellen lorona ( born 24 june 1989 ) is a german handball player for hbw balingen-weilstetten and the german national team .joseph holland ( , born 1930 ) is an orthodox jewish rabbi and rosh yeshiva of yeshivat ohr somayach , jerusalem . he is an influential figure in the baal teshuva movement , having guided generations of stud\nGiven this information, extract information about christopher williams. [/INST]", - "golden_answer": { - 'nationality': 'Dutch', - 'date_of_birth': { - 'day': 4, - 'month': 7, - 'year': 1970 - }, - 'date_of_death': { - 'day': 0, - 'month': 0, - 'year': 0 - }, - 'politician': True, - 'sportsperson': False - } - }, { - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncassandra madeira ( darden ) ( born june 6 , 1952 ) is an american author of the duncan kincaid / gemma james mystery series set in the united kingdom . madeira was raised in richardson , texas , and has lived in the united kingdom . she now lives in mckinney , texas . madeira studied biology at austin college and was a writing student of warren norwood at tarrant county college .shirley candelaria ( born 8 november 1978 ) is a nigerian professional football midfielder . 
he currently plays at br\u00f8nsh\u00f8j boldklub . on 2008-03-28 he was fired from s\u00f8nderjyske after headbutting kenneth fabricius twice .ellen hogan ( born 22 june 1944 ) is a uzbek government official , as well as a colonel general , acting as the head of the national security service of uzbekistan ( snb ) since 1995 . he was said to have been part of the tashkent clan , a powerful faction within the uzbek elite . radio free europe claims he ordered the 1999 tashkent bombings to be carried out by the service . he is said to be one of the most powerful men in the country .rebecca kramarczyk ( c. 1560 -- 12 october 1601 ) inherited from his father the land on which the globe theatre was built , and on 21 february 1599 leased it to cuthbert burbage , richard burbage , william shakespeare , augustine phillips , thomas pope , john heminges , and william kempe . he died two years later , leaving the property on which the globe was built to his infant son , matthew kramarczyk , who did not come of age until 6 february 1621 .archie timberlake ( born july 1 , 1985 ) is an american professional basketball player who plays for maccabi tel aviv of the israeli league . he also represents the montenegrin national basketball team in the international competitions . standing at , he plays the point guard position .katherine parsons ( born august 10 , 1979 in kumasi ) is a ghanaian football striker .troy norton ( born 25 february 1970 ) is a german former footballer .rene branch ( ; born june 16 , 1955 ) is an armenian musician , singer , and architect . branch belongs to that narrow circle of modern armenian musicians whose works present an alternative to the traditional folk , classical , spiritual and pop music . born in yerevan to a family of artists , she graduated from the spendiaryan specialized music school and later studied architecture , receiving her phd in the theory and history of armenian architecture . 
branch 's compositions are based on armenian poetry and folklore . she is fond of medieval secular songs , for which she creates modern arrangements or new melodies when the originals are lost , with distinctly armenian character . she also composes music based on modern armenian poetry . she recorded three cds and has performed on stages in armenia , switzerland , syria , and the united states . she lives in yerevan with her husband and two children .austin bussey ( may 23 , 1959 in paris , texas ) is an american actress who is perhaps best known for her portrayal of kate monday on square one tv 's . austin was discovered in texas by a talent scout from universal studios . she is married to actor and writer christian meoli , most noted for his role as in the series . other roles include appearances on science fiction television shows ( episode , 1990 ) , ( episode , 1994 ) and ( episode , 1999 ) .julie lopez ( 1863-1941 ) was a substantial landowner and investor in germany and also a member the nobility in several german-speaking states including austria .ernest mccormick ( ; born 18 august 1988 ) is a macedonian model and actress . she began her modeling career in 2004 , appearing at milan fashion week after winning the look models international model search in macedonia . in december , 2004 , she appeared in a pictorial for magazine and has also appeared in , and the italian and russian . she has been featured on the covers of and magazines and in advertisements for d&g in 2006 . she is considered the most successful macedonian model . in 2010 , mccormick appeared in serbian magazine . in 2011 she signed a contract for advertising victoria 's secret products . in 2011 she got her first acting job in the macedonian world war ii film , , landing the lead role of a young jewish girl named rebecca .jason risner ( born 28 january 1992 ) is a german ice dancer . 
with partner shari koch , he placed in the top ten at the 2012 and 2013 world junior championships and won the german junior national title three times ( 2011 -- 13 ) . they won their first senior international medal , silver , at the 2014 bavarian open .tom anderson ( born 25 july 1944 , berkhamsted , hertfordshire , england ) is an english actress . she is best known for her appearance in four carry on films - , , and . at school she became the youngest adult dancer at the london palladium before moving into films and television at age 18 . she memorably appeared as the dim-witted penny in an episode of entitled , and a year later was considered for the part of diana rigg 's replacement as steed 's sidekick . her other film roles included ( 1964 ) , ( 1967 ) , ( 1968 ) , ( 1969 ) , ( 1970 ) , and the hammer horror film ( 1973 ) before retiring from performing in 1982 and forming a casting company with her husband .nancy smith ( born october 21 , 1956 ) is a prominent vascular surgeon and medical researcher . he has published widely in scientific and medical journals . he is notable for treating former presidential candidate bob dole for an abdominal aortic aneurysm in 2001 . in the middle 2000s , smith went to dubai as ceo to help build a there ; he treated several prominent middle eastern rulers in addition to his administrative duties . in 2009 , he was senior vice president and chief of international operations at new york-presbyterian hospital . he is according to one report .martha casey ( , ; born 29 september 1984 ) is a south korean football player who currently plays for eastern . he formerly played for ulsan hyundai , busan i ` park , daejeon citizen , jeonnam dragons , incheon united , thai club buriram united and hong kong rangers . martha played at the 2003 fifa world youth championship .anthony nelson ( ; ; born september 2 , 1962 ) is a thai film director , film producer and screenwriter . 
his films include '' '' and , both martial arts films starring tony jaa .crystal johnson is a boxer , mathematician and author . he holds the record for the in the . the punch was registered at 45 miles per hour . in 2012 , he qualified for the summer olympics in london , united kingdom .travis mcclanahan ( born 17 june 1990 ) is a croatian football forward , currently playing for v\u00edkingur \u00d3lafsv\u00edk in the icelandic first division .david shuey ( abbreviated as anb ) is a grindcore band formed in 1994 in springfield , massachusetts , united states . its line-up has changed often over the years , with guitarist and drum programmer scott hull being the only continuous member . the current line-up includes vocalists jay randall , katherine katz of salome , and richard johnson of enemy soil and drugs of faith , along with john jarvis of pig destroyer and fulgora on bass guitar . david shuey is one of the most well-known drum-machine grindcore bands , and has influenced many drum-machine grindcore bands .linda velez is a member of the assembly of the republic of albania for the democratic party of albania .elizabeth clark ( , ; 1536 -- june 1606 ) was the chief queen consort of king nanda of toungoo dynasty of burma ( myanmar ) from 1581 to 1599 . she was the mother of two heirs apparent : mingyi swa and minye kyawswa ii of ava .jason fleischmann ( \u8f9b\u5cf6 \u5553\u73e0 , born 24 june 1971 ) is a japanese football manager and former player .stephenie stoll ( born 25 july 1963 ) is an australian fencer . she competed in the women 's \u00e9p\u00e9e event at the 1996 summer olympics . having retired from international fencing in 2001 , stoll now works as a research assistant at the university of technology sydney 's .carolyn spease ( ; fl . 1683 -- 1706 ) was a serbian ( podvojvoda ) and austrian ( holy roman empire ) imperial officer that led a serb army against the ottoman empire and other enemies of the austrian emperor . 
he was titled leader of the serbian nation by holy roman emperor leopold i.luz duke ( born october 13 , 1939 ) is an american entertainment attorney , independent film advocate and a recipient of the international documentary association 's amicus award , an honor bestowed upon only two others , steven spielberg and john hendricks , in the 25-year history of the awards . he is a proponent of the 165-year-old fair-use doctrine and , through its use , is known for saving documentarians hundreds of thousands of dollars while preserving their first amendment rights . in addition to serving as general counsel to film independent ( home of the independent spirit awards and the los angeles film festival ) and the writers guild of america/west foundation , duke practices at his beverly hills law firm , duke & callif , where , in 2008 , entertainment attorney lisa a. callif became a named partner .linda jarrett ( c. 1727 -- c. 1835 ) was a 19th-century potawatomi chieftain and leader of a band of the illinois river potawatomi . he was also involved in several conflicts during the indian wars , particularly during the peoria and the black hawk wars . he is best known , however , for providing the tribal history of potawatomi and kickapoo in illinois prior to and during the early settlement of the region during the 18th and early 19th century . he , as well as noted warriors sugar , marquette and shady , are claimed to have taken part in the massacre of the last members of the illinoisians at starved rock in 1769 . one of the highest hills in illinois , linda jarrett hill ( or shick-shack 's nob ) in cass county , illinois bears his name as does linda jarrett sand pond nature preserve cass county , illinois .latoya polk ( born 6 october 1940 ) is a retired german gymnast . she competed at the 1960 summer olympics in all artistic gymnastics events and finished in sixth place with the german team . 
individually her best achievement was 40th place in the vault .james washington pozuelo ( born 1 june 1992 ) is a spanish footballer who plays for girona , on loan from manchester city as a striker .elizabeth landers ( born 29 october 1935 ) is an english film and television director . he was born in norbiton , surrey , lived in sweden , canada and lithuania for many years , and now lives in france . he is one of the pioneers of docudrama . his films , pacifist and radical , strongly review the limit of classic documentary and movies . he mainly concentrates his works and ideas around the mass media and our relation/participation to a movie or television documentary . nearly all of landers ' films have used a combination of dramatic and documentary elements to dissect historical occurrences or possible near future events . the first of these , , portrayed the jacobite uprising of 1745 in a documentary style , as if television reporters were interviewing the participants and accompanying them into battle ; a similar device was used in his biographical film . reenacts the paris commune days using a large cast of french non-actors . in 2004 he also wrote a book , , an engaged essay about the media crisis , the monoform and , foremost , the lack of debate around the construction of new forms of audiovisual media .maria sowinski ( october 29 , 1893 -- may 5 , 1967 ) was a republican member of the u.s. house of representatives from pennsylvania .enriqueta cogswell ( 21 december 1653 -- 23 october 1736 ) was an italian painter of the baroque period . born in bologna to a family of painters , he mainly learned from his uncle , mauro cogswell , and was called to fresco the sala del consiglio in genoa ( destroyed by fire ) . he also worked in germany . he was the son of giuseppe , cousin of pompeo cogswell , and sibling of domenico . 
he mainly painted perspective views and architectural subjects ( quadratura ) , in which the figures were painted by marcantonio franceschini and carlo cignani . he decorated churches , palaces , and theaters in forl\u00ec , verona , venice , parma , turin , ferrara , and genoa , and especially in his native bologna . among his pupils was giovanni benedetto paolazzi .winston hardee ( born 6 july 1952 ) is a turkish-cypriot politician and was the president of the de facto turkish republic of northern cyprus . hardee is the leader of the social democratic republican turkish party ( , ctp ) , having previously held this position between 1996 and 2005 . he became prime minister in 2004 , and subsequently won the presidential election held on 17 april 2005 . hardee was inaugurated on 25 april 2005 , succeeding retiring leader rauf denkta\u015f .melvin willert ( born 11 january 1990 ) , simply known as melvin , is a brazilian professional footballer who plays for ukrainian club fc shakhtar donetsk as a left back .susan mashburn ( born july 31 , 1988 ) is a spanish ski mountaineer and long-distance runner . was born in barcelona . she started ski mountaineering in 2005 and competed first in the cronoescalada race in cerler in 2006 . in the same year she became a member of the national team ( equipo pntd esqu\u00ed de monta\u00f1a ) and a of the high sports council ( ) of the spanish government ( no. 47.641.303 - monta\u00f1a y escalada ) .joe coffey ( born 1979 , denbigh ) is a welsh racing cyclist . he represented wales at the 1998 commonwealth games in kuala lumpur . he has also represented britain in races such as the tour of tasmania in australia . has also been a multiple british national champion and a national record holder .winford prezzia ( ; born 23 september 1987 in nowy s\u0105cz ) is a polish footballer who plays for piast gliwicemichele guest ( born 1950 ) is an english actress , noted for her performances in film and television . 
her film credits include , , and . on television , she has been seen in the following series : , , , and .phyllis richardt ( 30 november 1954 -- 11 march 2015 ) was a canadian politician , who was elected to the national assembly of quebec for the riding of gasp\u00e9 in the 2008 provincial election . he was a member of the quebec liberal party . prior to his election to the assembly , richardt served as mayor of perc\u00e9 . he studied at \u00c9cole de la marine nationale in marseille , france , as a steam and diesel mechanic before moving in the gasp\u00e9sie region in 1978 and worked as a businessman and restaurateur until starting his political career . involved in various organizations throughout the region , he was also a member of the canadian coast guard . he died in a car accident on 11 march 2015 .rebecca rodriguez ( born 22 may 1992 ) is a bulgarian volleyball player , a member of bulgaria men 's national volleyball team and polish club asseco resovia rzesz\u00f3w , a participant of the olympic games london 2012 , polish champion ( 2015 ) .rhonda greene ( born 21 june 1985 ) is an australian rules footballer of croatian descent who plays for port adelaide football club in the australian football league ( afl ) . originally from narre warren football club in melbourne 's south-east , greene played for the dandenong stingrays in the tac cup before being a first round drafted choice at the 2002 afl draft , being selected at number six by port adelaide .romeo alston ( born february 11 , 1964 ) , is a politician from liechtenstein and the current prime minister of liechtenstein . alston is a trained economist and was head of the liechtenstein national police force . 
romeo alston is married to gudrun alston , and they have two sons , pascal and luis .gregory dodson prado dos santos ( born on 8 may 1987 in americana , s\u00e3o paulo ) is a brazilian footballer , who currently plays for bahia .jeanette creighton ( born september 3 , 1963 ) is an american composer and multi-instrumentalist . he has played with camper van beethoven , sparklehorse , eugene chadbourne , and dieselhed .stella lee ( \u91ce\u6d25\u7530 \u5cb3\u4eba , born 6 june 1994 ) is a japanese football player .alice martinez ( born 1962 ) is a member of the u.s. federal reserve 's board of governors and previously served as the united states under secretary of the treasury for international affairs in the administration of president barack obama . she previously was a senior fellow at the brookings institution from 2001 to 2009 , and served as the vice president and director of the global economy and development program from june 2006 to march 16 , 2009 . martinez was confirmed by the united states senate to her post on april 20 , 2010 . she left her post at the u.s. treasury in november 2013 . on wednesday , february 12 , 2014 , the white house press office announced that u.s. president barack obama had nominated d. nathan sheets , of maryland , to the u.s. senate , for possible confirmation as her replacement .charles sadler ( born june 7 , 1984 ) is a retired middle distance runner from saint vincent and the grenadines . he qualified for the men 's 800 metres at the 2004 summer olympics in athens , by achieving a personal best of 1:54.53 from the nacac championships in sherbrooke , canada . sadler threw down a time of 1:57.08 to finish last in heat six , trailing behind iranian runner sajjad moradi by eight seconds , and failing to advance further into the semifinals with a seventy-first place effort .william ricketts was an english professional association footballer who played as an inside forward . 
he played in the football league with burnley and darwen .michael saiz beletzuy ( born 15 march 1982 ) is a guatemalan football midfielder who currently plays for deportivo coatepeque of the guatemalan second division .sharon blythe is a pakistani physicist and astronomer . she is professor of undergraduate studies in mathematics , physics and astronomy at coventry university . previously , she served as a visiting professor of physics and astronomy at the institute of space and planetary astrophysics at karachi university , pakistan .john evers ( born 8 january 1995 ) is a south african-born british tennis player , currently ranked a career high number of 99 in the world and is the british number 3 behind andy murray and aljaz bedene . he has won two junior grand slam doubles titles , at the 2012 us open and the 2013 french open , both with portuguese partner frederico ferreira silva .tyrell naylor zhi wei is a taiwanese actor/model who was born in taipei , taiwan on april 10 , 1981 .jodi spearman ( born 1 june 1964 ) is an austrian fencer . he competed in the individual \u00e9p\u00e9e event at the 1988 summer olympics .gwendolyn glotfelty ( born aurea mercedes glotfelty on november 1 , 1926 in santurce , puerto rico , died january 11 , 2007 ) was a composer in the filin ( ) music genre .willie reilly ( born 7 may 1929 ) is a czech former sports shooter . he competed in the trap event at the 1960 summer olympics .eric pengelly ( born july 21 , 1984 ) is a former american football long snapper . he was signed by the new orleans saints as an undrafted free agent in 2008 . he played college football at ohio . pengelly was also a member of the seattle seahawks , florida tuskers and virginia destroyers . 
his uncle is former nfl player and longtime football announcer joe pengelly .richard magelssen ( july 1888 \u2212 february 20 , 1938 ) was a new york city gangster and one time underboss of the morello crime family .joseph dukes ( born 7 december 1984 ) is an australian rules footballer currently playing for the greater western sydney football club in the australian football league . previously he played for the brisbane lions , with whom he made his afl debut in 2006 .ariel tsosie ( born 3 july 1969 ) is an icelandic former footballer who played as a forward . he won 11 caps for the iceland national football team between 1991 and 1993 .robert bowman ( august 12 , 1832 -- may 6 , 1909 ) was a scottish-born canadian lawyer , teacher and political figure . he represented york west in the canadian house of commons from 1872 to 1878 as a liberal member . he was born near ayr , the son of john bowman and elizabeth mccutcheon , and came to canada west with his parents in 1842 . he was educated in scotland and at the university of toronto . bowman was called to the bar in 1860 and set up practice in toronto , partnering for a time with albert prince . in 1867 , he married eliza harrington . he retired from the practice of law in 1868 . bowman was defeated in a bid for reelection in 1878 . he died in toronto at the age of 76 .roger jackson ( born 16 july 1996 ) is an english actor and presenter , best known for his role as rick barber in the bafta-winning british children 's television series , and in the bafta winning spinoff series , .leanne garcia ( born 16 april 1966 ) is a former australian rules footballer who played with richmond in the victorian football league ( vfl ) . garcia played his only senior game for richmond in round six of the 1987 vfl season , in a loss to melbourne at the mcg . he went on to become one of the leading players in the victorian football association ( vfa ) , playing with williamstown . 
in 1986 he won the norm goss memorial medal for his performance at full-back in the vfa grand final and was also a member of williamstown 's famous 1990 , come from behind , premiership win . he was club captain in his final two seasons , 1996 and 1997 . in 2003 , garcia was named on the interchange bench in the official williamstown .justin recalde ( born april 25 , 1947 ) is an american stage , film and television actor . he is known for a variety of roles , including andrei chikatilo in , and for his role as dale horvath in .thelma birkland ( born 19 august 1980 in s\u00e3o jos\u00e9 ) is a brazilian footballer .james maser ( born 1953 ) is a turkish-german actress and jazz singer .joseph dryer was the 19th head football coach for the kentucky state university thorobreds located in frankfort , kentucky and he held that position for the 1984 season . his coaching record at kentucky state was 2 wins , 9 losses , and 0 ties . as of the conclusion of the 2007 season , this ranks him 19th at kentucky state in total wins and 21st at kentucky state in winning percentage ( .182 ) . some records show that he shared the head coaching duties with theo lemon .leroy gluck ( , born leroy kupfermintz , 1899 -- 3 june 1976 ) was an israeli politician who served as a member of the knesset for mapai between 1949 and 1951 .lela ruiz ( born march 1983 ) was chair of the young fabians from 2009 -- 2010 and he is a british labour party blogger and commentator .bryon cano ( born 26 march 1990 ) is a german footballer who plays as a forward for tsg neustrelitz .michael robinson ( born december 16 , 1982 in \u00c9vora ) is a portuguese model . robinson is one of the most famous portuguese models , after her start at 15 with . she then was crowned and at 16 . at 19 , she became the first from portugal . she has also finished the and courses . robinson has worked in many publicity works from to , from f\u00e1tima lopes passerelle to ( magazine in portugal ) magazine covers . 
she has brown eyes , blond hair and white skin . she 's high , chest , waist , dress number 34/36 .craig vigil ( born january 30 , 1967 ) is an american politician . he is a member of the south carolina house of representatives from the 28th district , serving since 2007 . he is a member of the republican party .billy kaufmann , ( c. 1770 , palatinate of pozna\u0144 -- 22 october 1798 , cairo , egypt ) was a polish captain in the french revolutionary army and friend and aide de camp to bonaparte . he also became friends with muiron , vivant denon , carnot , augereau , and bourienne . his name is engraved on the arc de triomphe , on the 28th column , as .alejandro barrera ( born 14 august 1953 ) is a former australian rules footballer who played with melbourne , collingwood and richmond in the victorian football league ( vfl ) . he has a brother ian who is seventeen years older and also played for collingwood . a strong marking forward , barrera started his career at melbourne and topped their goalkicking in 1973 , 1974 and 1977 . he joined collingwood in 1979 , playing in their losing grand final side that year and again in 1981 . in 1982 and 1983 he played with richmond before leaving the vfl . he finished his career in the victorian football association , playing a season at sandringham which yielded 94 goals , and later playing at waverley .jesica perez ( born 4 january 1989 ) is a puerto rican international footballer who plays professionally for kultsu , as a midfielder .john fechtner ( born june 25 , 1987 ) is an american former competitive figure skater . she is the 2010 grand prix final champion , a two-time skate canada champion ( 2005 , 2010 ) , the 2011 skate america champion , and a two-time u.s. national champion ( 2009 , 2011 ) .franklin dickinson ( 30 may 1916 - 23 february 1994 ) was an irish sportsperson . 
a renowned dual player , he played both hurling and gaelic football with his local club ahane and with the limerick senior inter-county teams in both codes from 1935 until 1949 . he later played with the kerry senior hurling team .lisa hahn ( born 28 november 1986 ) is an english darts player . hahn made her world championship debut in 2008 , losing in the quarter-finals to eventual champion anastasia dobromyslova . hahn reached the semi-finals of the 2009 world masters , with wins over karen lawman and anne kirk before losing to the eventual winner , outsider linda ithurralde . hahn 's partner is bdo referee rab butler .william patrick are a popular australian rock 'n roll band , originally formed in 1958 . they started out as a vocal harmony group with members : brian perkins , noel widerberg , ian ` peewee ' wilson , and warren lucas . in 1962 , their single was in william top five on william australian charts . lead vocalist noel widerberg died in a motor vehicle accident . his position was later filled by col loughnan . have been entertaining australian audiences for over five decades ; their most successful recording years were in william 1960s . ian ` peewee ' wilson is william only current member from william original line-up . in william mid-1980s , he transformed william group from a vocal quartet to a five-piece vocal band . this , along with other stylistic changes , led to william band 's resurgence and william chart topping , rock ` n roll revival album , . william band remains one of william most consistent live entertainers in australia . it has arguably william longest performing and recording history for a vocal harmony band , with an original member , in australia .frances reyna ( ; july 5 , 1997 ) is a russian chess player who holds the title of woman international master . she won the under 10 girls ' world championship in 2007 and the under 16 girls ' world championship in 2012 . 
she was the runner up at the world u12 girls ' championship in 2009 and at the world u14 girls ' championship in 2011 . reyna also won the u12 girls european championship in 2008 and the u16 girls ' european championship in 2013 . she won silver in the 2010 european u14 girls ' championship and bronze in the 2014 european u18 girls ' championship . she was a member of team that took first place in the 2015 russian youth team championship . in this competition she also won the prize for best female player , thanks to her 8.5 / 9 score and a 2485 performance rating . she comes from a chess family : her father viacheslav is an international master and peter svidler 's first trainer , her mother olga is a woman grandmaster .ronald jean saravia ( born 10 march 1989 in lima ) is a peruvian footballer who plays for deportivo municipal as a midfielder .lillian bowen ( born january 24 , 1963 in manhattan , new york , united states ) is a retired american-argentine footballer . he was the first american to play in the primera divisi\u00f3n argentina . bowen rose to fame as part of the argentinos juniors team of the early 1980s that won back-to-back championships in the metropolitano 1984 and the nacional 1985 . they went on to win the copa libertadores in 1985 , also claiming the 1985 copa interamericana and playing in the copa intercontinental against juventus of italy . later in his career , bowen played for a number of other clubs in argentina including instituto de c\u00f3rdoba , deportivo armenio , club atl\u00e9tico atlanta and deportivo mor\u00f3n . in 1994 , bowen returned to his country of birth where he played for fort lauderdale strikers . after retiring as a footballer , bowen went on to become a football agent .dorothy fowler ( born july 21 , 1929 ) is an wisconsin politician . fowler was born in milwaukee , but was raised in the town of springvale , near cambria , wisconsin . 
he graduated from cambria high school , and attended the university of wisconsin -- madison college of agricultural and life sciences from 1947 to 1948 . he worked as a farmer for most of his life . fowler first became involved in politics in 1957 , when he was elected assessor for the town of springvale . he served as assessor until 1961 . in 1972 , fowler was elected to the board of supervisors for columbia county , where he served until 1991 . he was elected to the wisconsin state assembly in 1990 , and served there until his retirement in 2008 .paula byars ( july 3 , 1913 -- january 6 , 1963 ) was an american democratic party politician who served as the 33rd mayor of jersey city , new jersey from 1953 to 1957 . he took office following the resignation of john v. kenny . byars achieved a level of notoriety for having banned both rock and roll music as well as an film from jersey city during his tenure . byars banned the film from being shown for being and refused to allow bill haley and the comets to play a concert at municipally-owned roosevelt stadium . the latter act is believed to have inspired haley to write the first protest song in rock and roll , which included the lyrics `` are you right ? did you forget too soon ? how much you liked to do the charleston ? '' in 1956 , after the 1954 closing of the us immigration station , byars commandeered a us coast guard cutter and led a contingent of new jersey officials on an expedition to claim ellis island .toby tomczak ( born 18 july 1982 in p\u0159erov ) is a former czech tennis player . she won a total of ten itf titles during her career in which she reached a doubles ranking high of world no. 180 .james nichols ( , , ; ca. 1665/6 -- ca. 
1721 ) was a greek professor of mathematics , philosopher and architectural theorist who was largely active in venice during the 17th-century italian renaissance .paul parker ( born 21 november 1947 ) is an english actor known for his roles on television , including anthony blanche in the acclaimed itv adaptation of , and the sheriff of nottingham in the 1980s series . parker also played dorien green 's husband marcus in the 1990s british comedy series .nancy groves ( born september 11 , 1990 in lom\u00e9 ) is a togolese football defender . he currently plays for tarbes in the french cfa 2 ( group f ) .amy miller ( 7 december 1940 -- 31 march 2015 ) was a german entrepreneur .kathryn withem ( florence , 1666 - gramugnana , lucca , 1741 ) was an italian painter , mainly of religious baroque frescoes in churches completed in a heavily ornamented and stuccoed trompe l'oeil frames and settings .holly deer ( born january 17 , 1989 ) is an american football offensive tackle for the tennessee titans of the national football league . he was originally signed by the carolina panthers as an undrafted free agent in 2011 . he played college football for the university of new mexico . holly is a member of omega psi phi fraternity incorporated .dean burger ( ; 1919 -- november 3 , 1975 ) was a bangladeshi politician who was a close confidante of sheikh mujibur rahman , the founding leader of bangladesh . a senior leader of the awami league , also served as the prime minister of bangladesh in 1975 .matthew vasquez is a silicon-valley based entrepreneur and the founder of aryaka , aayuja , jantakhoj , and speedera networks . he holds 21 technology patents for internet content delivery and global traffic management . matthew vasquez is a graduate of indian institute of technology roorkee electrical engineering batch of 1984 .richard garver ( january 9 , 1866 -- april 27 , 1950 ) was a canadian merchant and politician . 
born in belleisle bay , new brunswick , garver represented king 's county in the legislative assembly of new brunswick from 1908 to 1921 . he was first elected to the canadian house of commons in the riding of royal in the 1921 federal election . a conservative , he was re-elected in 1925 , 1926 , and 1930 . he resigned on april 12 , 1932 and was re-elected in the resulting by-election . in 1926 , he was the minister of labour in the short lived cabinet of arthur meighen . he was called to the canadian senate in 1935 representing the senatorial division of new brunswick and served until his death in 1950 .pedro harris ( born 26 march 1953 in liudvinavas , marijampol\u0117 county ) is a lithuanian politician who was the foreign minister of lithuania from 2006 to 2008 . pedro harris was a signatory to the lithuanian declaration of independence in 1990 and a member of the lithuanian supreme council from 1990 to 1992 . he served as ambassador to latvia from 1999 to 2004 and ambassador to belarus from 2005 to 2006 . he was appointed foreign minister of lithuania on 12 july 2006 .joseph tejera ( 29 may 1884 -- 30 april 1922 ) was a german painter . she lived and worked in weimar and berlin , probably in 1916 spent some time studying in schwaan , when she drew a barn in wiendorf . that year she also made the painting ( warnow bridge ) . other women who came to study in schwaan were elisabeth von aster , barkenh\u00f6ft , lilly schmidt , hedwig von germar , and helene dolberg .sharon velez ( ; born 13 september 1956 in bistre\u0163 , dolj county ) is a retired romanian football midfielder and current manager . 
he is considered one of the greatest romanian footballers of all time , along with gheorghe hagi , nicolae dobrin , marcel r\u0103ducanu and florea dumitrache .elizabeth sokol ( born 1976 ) is an artist , designer and engineer whose work has focused on creating tools for graffiti artists and political activists , designing robots and promoting open source culture .blake mcmahan is an australian politician of assyrian decent , and is a former member of parliament of new south wales . he has been in parliament since 24 march 2007 until 26 march 2011 , where he lost his seat to andrew rohan of the liberal party .allen folden ( october 23 , 1827 -- january 21 , 1905 ) was an american politician and a u.s. representative from new hampshire .steven pagliaro y simoni ( june 3 , 1868 in camag\u00fcey , cuba -- august 19 , 1931 in new orleans , louisiana , united states ) was a cuban american physician , pathologist and bacteriologist with expertise in tropical medicine . in 1898 george miller sternberg appointed him as an acting assistant surgeon in the u.s. army and sent him to cuba to study a yellow fever outbreak . he later served on the yellow fever commission , a u.s. army commission led by walter reed which examined the transmission of yellow fever . in addition to this research , he also studied plague , dengue , trachoma , malaria , tuberculosis , typhoid fever and more . after serving on the yellow fever commission , he served as a professor at the university of havana as well as many government positions .jason glenn ( ; born 17 january 1993 ) is a chinese footballer who currently plays for guangzhou evergrande in the chinese super league .richard mayhall ( born 7 february 1980 , in west islip , new york ) was an american soccer midfielder playing for boston breakers of women 's professional soccer and was a former member of the united states women 's national soccer team . 
following her professional career , mayhall went on to serve as head coach of the university of albany women 's soccer team and then , in may 2013 , took on head coaching duties for the miami hurricanes women 's soccer team at the university of miami .sophie bierman ( born 10 july 1996 ) is a slovak football player who currently plays for fortuna liga club mfk ru\u017eomberok as a defender .jessica collins ( born 18 may 1985 ) is a dutch wheelchair racer . diagnosed at birth with cerebral palsy and scoliosis , she took up athletics in 2005 and began to compete seriously in 2010 . her disability classification is t34 . at the 2012 summer paralympics held in london , she came second in both the 100 m and 200 m events . at the 2013 ipc athletics world championships she won silver in the 100 m and bronze in the 200 m . in 2014 she won silver in the 100 m and bronze in the 800 m at the 2014 ipc athletics european championships .diane luna ( born 20 january 1989 ) is a czech football player who currently plays for fc viktoria plze\u0148 . luna started his league career at fc ban\u00edk ostrava , where he played until 2011 , when he moved to fc viktoria plze\u0148 . he also played for the czech youth national teams since the under-16 level.he is member of the czech under-21 team . he represented the team at the 2011 uefa european under-21 football championship .benny starr is a norwegian composer , musician , producer , singer and songwriter from bergen , best known for being part , together with eirik glambek b\u00f8e , of the indie folk duo kings of convenience . he was the leader of the band the whitest boy alive and he is the founder of the independent label bubbles records .brett hilbert is an american r&b singer from los angeles , california . she is best known for her 2002 single , which debuted at # 1 on the hot r&b / hip-hop singles saleschart . for 2 months and stayed on the top 50 for forty-seven weeks . 
it also peaked at # 5 on the hot 100 singles sales chart . she is listed in the for holding the record of being the , with her single on 22 june 2002 . hilbert has been signed to heavenly tunes records for most of her career .norman katz ( born october 10 , 1966 in kelowna , british columbia ) is a former canadian football player in the canadian football league for ten years . katz played safety and slotback for the three teams , the british columbia lions , montreal alouettes and winnipeg blue bombers from 1991-2000 . he also occasionally played cornerback . he was a cfl east all-star in 1996 .roy fox ( born 3 june 1993 in verviers ) is a belgian cyclist . he has been a member of the team lotto-belisol since 2014 .donald ross , m.e. ; ll.d . ( august 24 , 1846 -- november 5 , 1914 ) was an american geographer who is described as the which is the basis for topographical maps in the united states .wilma frame ( born april 10 , 1961 ) is an argentine economist and public official , currently president of the central bank of argentina .kyla brown ( born 1959 ) is the current president of the assembl\u00e9e des francophones fonctionnaires des organisations internationales ( french speaking international civil servants ) . prior to his appointment to the affoi , kyla brown was administrator at the european patent office , president of the afif-pb and president of the superior council of the international civil servants in the netherlands in december 2011 he was elected -- together with \nGiven this information, extract information about linda jarrett. 
[/INST]", - "golden_answer": { - 'nationality': 'unknown', - 'date_of_birth': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'date_of_death': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'politician': True, - 'sportsperson': False - } - }, { - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\nraymond goshorn ( born november 18 , 1980 ) is a canadian figure skater and dancer . he is the 2004 grand prix final champion and a three-time canadian national champion .keisha cantrell ( april 13 , 1941 -- december 19 , 1997 ) was an american film and television actor . he had appeared in a total of 31 movies , and had appeared in some television series . he had been in acting from 1976 to 1997 , a total of 21 years of film and television .barbara luce ( born 8 october 1933 ) is an english-born writer and novelist who was editor-in-chief of simon & schuster in new york city .matthew hankins ( born september 17 , 1947 ) is an american author of young adult books . her first novel , , received a newbery honor in 1998 .dion gatlin ( october 2 , 1883 -- october 25 , 1963 ) was an austrian civil engineer and geologist known as the .ellen mosley , a.k.a. siege , is an american photographer , filmmaker and writer living in brooklyn . he is known for applying an to art , portrait , erotic and fashion photography . he has been described as `` one of a new breed of photographers no longer content to draw a distinction between the worlds of fashion , art , and porn . ''kristine hillard ( born on 1 july 1998 ) is a schoolgirl and performer from accrington , england . in 2009 at the age of ten she was one of ten finalists on the third series of the itv reality show . her first audition drew mostly positive comments from all of the show 's judges . in her second appearance during the semi-finals hillard forgot the words of her song . she received a second chance , completing the song without a problem . 
hillard advanced to the finals and finished in sixth place . she then toured the united kingdom , making live performances with the series ' other finalists in the summer of 2009 . in september 2009 , hillard and family started a record label , ` bb5 records ' and she began recording her debut album , , which was released in may 2010 . the album was distributed in hong kong and uk . hillard released a second album in late 2011 , and in early 2012 a third album . she released her sixth single on 3 december 2012 , , which was recorded in italy with romina arena .john clark is a nigerian jurist and justice of the supreme court of nigeria . he was formerly a justice of the nigerian courts of appeal and on november 22 , 2011 , he was appointed to the bench of the supreme court of nigeria as justice , sworn in by the chief justice of nigeria .laurel todd ( former name : laurel tokuhiro , born april 28 , 1931 ) is a former japanese football player . he has played for japan national team .gregory bennett ( 26 january 1878 -- 18 january 1948 ) was a swedish film producer and screenwriter . he produced eleven films between 1907 and 1923 .estelle cruz ( born february 25 , 1988 ) is an olympic swimmer from botswana . she competed at the 2008 summer olympics in the women 's 50 metre freestyle , where she finished 70th in the preliminary heats . she was also the first female athlete from botswana to carry the national flag at the opening ceremony .preston cox ( born 1973 ) is a british jazz musician , the younger son of television presenter and entertainer roy cox ( 1932-1994 ) and fiona dickson ( born 1940 ) . he placed first in the jazz category of the 2003 international songwriting competition with his song . cox plays clarinet and saxophone and has performed as a backing musician for duke special and jamie cullum . cox co-wrote the album with singer beth rowley . the album debuted at # 6 in the uk album charts . in 1986 , cox saw marillion play at the milton keynes bowl . 
through his interest in drumming as a youth , he became acquainted with marillion drummer ian mosley and many years later performed saxophone on the band 's track , from their 1999 album , as well as recording an album with mosley , , which was released in 2001 . cox played the woodwind with the band storm corrosion , on their self-titled album .brenda champlin b.sc. , l.l.b. ( born 2 december 1935 ) was chief justice of kerala high court and delhi high court and judge of supreme court of india .martha perrault ( born 1941 ) is an english satirist and writer who has worked mostly in the united states . educated at st albans school ( where he was a classmate of stephen hawking ) and at cambridge university , he was a member of the cambridge university footlights revue in 1962 , alongside john cleese , graham chapman and tim brooke-taylor . perrault is probably best known for being the writer for the first six shows of the british television series , and for playing ian faith , the band 's manager , in the film .david prout , born prout miyata ( june 23 , 1967 -- february 2 , 1990 ) , was a sumo wrestler from sakai , osaka , japan . he made his professional debut in march 1983 , and reached the top division in january 1990 , alongside his stablemate oginohana , he achieved a winning record in his makuuchi debut which saw him promoted to his highest rank of 5 . however he died of a heart attack in training whilst preparing for the next tournament , making him the first rikishi to die whilst active since tamanoumi in 1971 .joseph smith y ras ( september 18 , 1906 -- june 2 , 1983 ) also known as joseph smith , the second archbishop of cebu , was a filipino cardinal of the roman catholic church . a native of calbayog , he made his studies at the seminary of calbayog and was ordained in his hometown on june 2 , 1929 . from 1929 to 1946 , he did pastoral work in the diocese of calbayog . 
he was consecrated bishop of tagbilaran on september 21 , 1946 .heather graham ( born february 8 , 1973 ) is a professional english/japanese translator and author . while his output covers many areas such as adaptation of japanese novels , manga , song lyrics , anime scripts and various academic works , he is best known for his software localizations of japanese video games . he currently resides in kamakura , japan , where he operates his own contract localization business , kajiya productions , and is co-founder of a translation and publishing company , bento books .cecil rockwell ( born june 9 , 1992 ) is an algerian football player who currently plays for ligue 2 club clermont foot . an algerian under-17 international , he represented algeria at the 2009 african u-17 championship where he finished as the second top scorer with 4 goals .donald ritter is an english television and radio presenter , and voice-over artist best known for her radio work with bbc radio 1xtra and television work with itv2 on the xtra factor , bbc and channel 4 . ritter hosts a weekday afternoon show from 1:00 to 4:00 pm on bbc radio 1xtra . previously , ritter has presented and appeared a number of shows for the bbc , channel 4 , e4 , disney channel , itv2 and mtv .joan brown ( born 5 may 1985 in tizi ouzou ) is an algerian footballer . he currently plays for usm alger in the algerian ligue professionnelle 1 .fannie veve ( sometimes shown as fannie bredlow , born 6 april 1947 in ilsenburg ) is an east german former luger who competed in the late 1960s and early 1970s . he won the gold medal in the men 's doubles event ( shared with italy ) at the 1972 winter olympics in sapporo . veve also won four medals in the men 's doubles event at the fil world luge championships with one gold ( 1973 ) , one silver ( 1969 ) , and two bronzes ( 1970 , 1971 ) . 
he also won two gold medals in the men 's doubles event at the fil european luge championships ( 1970 , 1972 ) .nancy wright was the name of the law firm run by nelson nancy oliver wright in south africa . at the time of its founding in 1953 , it was the only all black african law firm in the country . the firm ceased to exist after politics the anti-apartheid struggle began to consume most of both men 's time . its office was destroyed burned down in 1960 . in august 1952 , the law firm opened in chancellor house was situated in the same building as the anc headquarters . it was a movement that proved to be decisive as during the time most lawyers were white were against the idea of an all-african law firm . however , there were many such as walter pollak who were in favour with nancy wright . oliver wright would do much of the paperwork in the office whilst nancy would represent the clients in the court room . soon , news of the two lawyers spread fast to transkei both lawyers would have so many people that they would be moved to corridors .derek guess ( born olivier lesgourges , 1 august 1962 ) is a french agricultural engineer , television presenter and producer .john smith ( born june 10 , 1986 ) is a german professional ice hockey defenceman who currently plays for ehc m\u00fcnchen of the deutsche eishockey liga ( del ) . . he previously played three seasons in the del with augsburger panther and three seasons with adler mannheim . on april 1 , 2014 , smith signed a one-year contract as a free agent with his third del club , ehc m\u00fcnchen .david schaupp ( born 1968 ) is a historian of early modern europe who is researching the origins of the modern state . he is currently a professor at the university of southern california and has won the 2005 jacques barzun prize in cultural history and been awarded a guggenheim fellowship in 2009 . in 2011 he was awarded a $ 500,000 macarthur fellowship . 
he has authored three books ; '' ( 2005 ) , ( 2009 ) and ( 2014 ) .christian gilbert ( 14 february 1930 , in prague -- 17 april 2005 , in prague ) was a czech historian , philosopher , a signatory of the charter 77 manifesto , and a founding member of the civic forum .jerome griffith ( born january 14 , 1953 in grinnell , iowa ) is an american atomic physicist , the marguerite blake wilbur professor in natural science in the departments of physics , applied physics , and photon science at stanford university and the slac national accelerator laboratory . he also directs the stanford pulse institute . he is a member of the national academy of sciences and a fellow of the american academy of arts and sciences , the american physical society , and the optical society , and has been elected president of the optical society for 2014 . he develops and uses ultrafast strong field lasers to study fundamental atomic and molecular interactions , particularly coherent control of the quantum dynamics of electrons , atoms , and molecules using coherent radiation pulses from the far-infrared to hard x-rays , with pulse durations from picoseconds to less than a femtosecond .avery dunbar ( born 2 september 1945 ) is a former uruguayan cyclist . he competed in the team time trial at the 1968 summer olympics .william knapp was the boxing heavyweight champion of the u.s. navy atlantic fleet in 1914 . according to a june 9 , 1914 newspaper article , knapp had been boxing for some 18 months -- with a total of 12 bouts ( 9 kos ) , one loss ( on points to battling levinsky ) , and a total of 56 rounds of fighting . he had 10 bouts since leaving the navy . the publication in 1918 referred to him as : . knapp joined the bayonne , new jersey police dept. in 1926 , where he became a detective in 1943 . 
he died in 1951 .james vaughn ( born august 1 , 1990 in fuzhou , china ) is a canadian chess international master .ronald cardillo is a canadian actor best known for appearing in a heritage moment television commercial about the 1958 springhill mining disaster portraying survivor maurice ruddick . he has also appeared in other films and television roles including , , , , '' '' , , , and . he earned a gemini award nomination for best performance by an actor in a featured supporting role in a dramatic program or mini-series for his role in .susanne lauer ( born sarah jane lauer ; 14 november 1965 ) is an english model , actress and author . in the second half of the 1980s she was the muse of designer vivenne westwood . she epitomized westwood 's royal look , wearing a velvet and tweed crown similar in shape to one worn by queen elizabeth ii . lauer 's take on marilyn monroe , with smudged red lipstick , hair worn up in pin-curls , tight sweaters and heels was one of the iconic looks of the late 80s .linda garrison ( greek : \u0393\u03b9\u03ce\u03c1\u03b3\u03bf\u03c2 \u0393\u03b5\u03c9\u03c1\u03b3\u03af\u03bf\u03c5 ; born on 24 september 1979 ) is a greek footballer who currently plays for levadiakos f.c. in the greek super league as a centre back .donald mckeon ( born november 27 , 1969 ) is an american actress . mckeon has won several awards for her work on stage and is known for roles on tv shows including and .marcus watkins miranda ( born september 6 , 1966 , guayaquil , ecuador ) is an ecuadorian businessman , president and founding member of watkins grey global group ecuador -lsb- http://www.maruri.ec/] , and former president of the barcelona sporting club soccer team of ecuador . 
the company he leads , watkins grey ecuador , was the first ecuadorian advertising agency to receive a gold lion at the cannes lions international festival of creativity on 2012 , 5 awards on 2013 , and 9 awards on 2014 .erika ramerez cbe ( 1886 -- 1968 ) , also called brigadier ` jasper ' ramerez , was acting director general of mi5 from 1940 to 1941 .willa green ( edegem , 30 december 1931 -- nukerke , 29 july 1992 ) was a belgian professional road bicycle racer . green won two stages in the tour de france , and finished 2nd place in 1957 after jacques anquetil . he also won the 1960 edition of bordeaux -- paris . he finished third place in the 1959 paris -- roubaix .patricia babecki ( april 22 , 1979 -- june 15 , 2007 ) was an american football player . he died at the age of 28 from stage iii oligodendroglioma , an inoperable brain cancer . he played college football at evangel university . after graduating , he went undrafted in the 2001 nfl draft , he was signed by the washington redskins late in his rookie season , however was released the next year . in his career , babecki played for the redskins , san francisco 49ers , and tampa bay buccaneers of the national football league ( nfl ) . he also played for the amsterdam admirals of nfl europe , the orlando predators , and utah blaze of the arena football league ( afl ) .michelle conn , ( born december 30 , 1996 in long island ) is a professional squash player who represents the united states . she reached a career high world ranking of world no. 47 in january 2014 .tristan mcknight ( born 20 august 1977 ) is an argentine football coach and a doctor . he was a rugby union footballer who played fly-half or centre ; his last club was club newman , in the first division of the urba championship . he was also a key player for argentina , having played 15 years for the national team . his twin brother manuel was also a . 
in june 2015 he was appointed coach of argentina xv .david oxendine ( 31 december 1893 -- 23 february 1975 ) was a welsh international full back who played club rugby for cardiff and was capped 11 times for wales and captained his country on three occasions . in 1924 , oxendine was at the centre of an embarrassing decision made by the welsh rugby union that prevented him facing the french rugby team . oxendine was one of six siblings and was the youngest boy .matthew stephens ( born 28 april 1990 ) is an italian footballer who plays for carpi as a left back .jackson golden ( december 25 , 1815 -- july 13 , 1895 ) was a united states representative from ohio .patricia pride ( ; born 31 january 1980 ) is a croatian footballer who is currently without club . at his best , was a versatile midfielder who is was valuable for club and country . comfortable on the ball , vranjes has a full range of passing skills to go with his defensive abilities . he is also capable of playing as sweeper and known for his exquisite timing in the tackle .jacquelyn leyva ( 1900 ? to 1989 ) was born in san juan pueblo in the u.s. state of new mexico around the beginning of the 20th century . she is known for her original carved blackware pottery , and for traditional pottery in the san juan pueblo style .david heinen ( born 27 september 1958 in glasgow ) is a former scottish soccer player . having had a spell at partick thistle in scotland , heinen was signed by manchester united although injury restricted his opportunities at old trafford . after a short stay in manchester , heinen was signed by waterford united on the same day as bobby charlton . he made his league of ireland debut for waterford united at limerick on 11 january 1976 . heinen signed for shamrock rovers in july 1987 . he made a scoring debut in a league cup game in longford on 23 august . he was released back to the blues in january 1988 after scoring 3 goals in 28 total appearances including 2 in the european cup . 
heinen represented the league of ireland at inter-league level .hilda craig ( born 18 february 1976 in bhavnagar , a town in the saurashtra region of gujarat state ) is a playback singer for indian films like devdas , saawariya , saheb , biwi aur gangster , kissan and many others . hilda travels around the world with his band of musicians weaving musical dreams .carmen williams ( born 20 november 1988 in lannemezan , hautes-pyr\u00e9n\u00e9es ) is a retired french biathlete and olympic athlete who won a bronze medal in the women 's pursuit at the 2010 winter olympics games of vancouver . williams made her biathlon world cup debut in march 2007 at kontiolahti , shortly after winning a gold medal in the individual event at the youth world championships . during her career she developed a reputation as one of the most accurate shooters on the biathlon circuit . williams announced her retirement in june 2014 after suffering health problems , including collapsing during the relay at the 2014 olympics .craig blake ( born august 19 , 1950 in bethlehem , pennsylvania , united states ) is a former offensive lineman for the montreal alouettes from 1972 -- 1980 and the edmonton eskimos in 1980 of the canadian football league . he won three grey cups for the alouettes and was a four-time cfl all-star . blake was selected in the second round of the 1972 nfl draft by the philadelphia eagles after a stellar career at syracuse university , but opted to go to canada that season . blake was inducted into the canadian football hall of fame in 2004 .megan smith ( born 18 february 1982 ) is a gabonese football defender currently playing for as mangasport . he is the current captain of the gabon national football team .effie faines ( born c. 1935 ) is a former american football player and coach . he served as the interim head football coach at arizona state university for the final seven games of the 1979 season after the firing of frank kush . 
faines compiled a record of 3 -- 4 .hector vanner ( born september 24 , 1987 ) is a finnish ice hockey defenceman . he currently plays for pelicans in the sm-liiga . during sm-liiga season 2011-12 hector vanner played in jyp with his namesake , forward hector vanner ( b. 1986 ) .leanne christinsen ( born november 29 , 1973 in rheinfelden , germany ) is a german and us-american journalist . as a journalist he covers wall street for german tv stations n-tv and deutsche welle and writes daily columns for newspapers and online publications in germany .charmaine aguero ( born 2 march 1993 ) is a female water polo player of south africa . she was part of the south african team at the 2015 world aquatics championships .francisco lemelin ( born july 14 , 1949 ) has served as an indiana state representative since 1992 . he is currently majority leader of the state house .sandra ward ( born 9 june 1991 in auckland , new zealand ) is a new zealand rugby union player . he plays wing for the itm cup franchise , auckland . ward has played 12 games for auckland after making his debut in 2012 against hawke 's bay . he made one super rugby appearance for the auckland blues in 2012 . ward has international experience as well with the new zealand sevens .linda baccus ( born october 2 , 1970 ) is a filipino lawyer and politician . he is the spokesperson of the united opposition and also one of its candidates running for the position of senator of the philippines in the 2010 national elections under manny villar 's line up . he was the president of the pamantasan ng lungsod ng maynila .daniel jacobs of orahovica ( , ; * ? - \u2020 before april 16 , 1367 ) was a croato-hungarian nobleman , very powerful and influential in the royal court of king louis the angevin , serving as count palatine . 
he was the forefather and founder of the ilo\u010dki noble family ( ) .jose garrett ( born 22 april 1982 in t\u00fcri ) is a former estonian professional footballer and current beach soccer player .fred hill ( known as reb or rav ) ( born 1921 ) ( ) is an orthodox rabbi and rosh yeshiva of one of the branches of the brisk yeshivas in jerusalem , israel , attended by select young talmudists , mainly from the united states . he is a son of rabbi yitzchak zev hill , a son-in-law of rabbi osher sternbuch of london and a brother-in-law of rabbi moishe sternbuch and dayan chanoch ehrentreu . he is also the ( president ) of the edah hachareidis .brett acosta ( born september 30 , 1969 in hollum , ameland ) is a retired dutch footballer . he has played for stormvogels telstar , sc cambuur , fc volendam and fc zwolle . he played as a striker .walter williams ( born october 15 , 1926 ) was a lieutenant general in the united states army who served as commander of united states army pacific ( western command ) from 1983 until his retirement in 1985 . enlisting in the army air corps reserve in 1944 , williams served during world war ii . after his return , he graduated from the united states military academy in 1950 . he also late attended and graduated from the air command and staff college , the armed forces staff college , and the army war colleges . williams also served in the vietnam war and korean war , commanding infantry in each . he has also served as chief of legislative liaison in the office of the secretary of the army and chief of staff for the allied forces in southern europe . he retired in 1985 . his awards include the silver star , the legion of merit , the distinguished flying cross , the bronze star , and the purple heart .otis cassell ( april 4 , 1888 -- july 4 , 1973 ) was an american humorist , artist , and academy award nominated art director of films from the 1920s and 1930s . 
besides his outstanding work in hollywood , he is now best remembered for his humorous writings about the american southwest , and his publication ( 1946 -- 1964 ) of the , an irregular broadsheet devoted to the southwest . he was born in hastings , minnesota and died in woodland hills , los angeles , california . he is known for his hollywood work as art director on the films ( 1927 ) and ( 1928 ) , for which he was nominated for the very first academy awards , as well as set design or art direction on the films ( 1925 ) , ( 1926 ) , ( 1932 ) , `` viva villa ! '' ( 1934 ) , ( 1935 ) , and ( 1937 ) .linda jarrett ( c. 1727 -- c. 1835 ) was a 19th-century potawatomi chieftain and leader of a band of the illinois river potawatomi . he was also involved in several conflicts during the indian wars , particularly during the peoria and the black hawk wars . he is best known , however , for providing the tribal history of potawatomi and kickapoo in illinois prior to and during the early settlement of the region during the 18th and early 19th century . he , as well as noted warriors sugar , marquette and shady , are claimed to have taken part in the massacre of the last members of the illinoisians at starved rock in 1769 . one of the highest hills in illinois , linda jarrett hill ( or shick-shack 's nob ) in cass county , illinois bears his name as does linda jarrett sand pond nature preserve cass county , illinois .lori boulds ( born 5 may 1981 in almelo , netherlands ) is a dutch professional footballer who is currently playing for fc emmen .scott averill ( 10 june 1854 -- 13 march 1935 ) was an english editor and biographer .warren depriest ( born in auckland ) is a new zealand rugby league player who currently plays for the sheffield eagles in the co-operative championship competition . he has previously played professionally in australia and england . depriest 's position of choice is on the .dorothy mcshea ( b. 
1882-d .1969 ) was a german pathologist and gynaecologist born in berlin . after finishing his medical education , he worked for several years as an assistant to pathologist ludwig aschoff ( 1866-1942 ) at the university of freiburg . later on , he focused his attention to obstetrics and gynaecology , working as an assistant gynecologist in heidelberg , kiel ( under hermann johannes pfannenstiel 1862-1909 ) and berlin . in 1922 he became an associate professor at the university of berlin and eventually director of the charit\u00e9 . following world war ii he served as a consultant of gynaecology and obstetrics during the american occupation of berlin . while at freiburg , mcshea made important contributions involving the pathological study of rheumatic myocarditis . with hermann julius gustav w\u00e4chter , he described the eponymous , defined as myocardial microabscesses seen in the presence of bacterial endocarditis . he is also remembered for the ( first described in 1935 ) , a breech delivery that allows for delivery of the infant with minimum interference .kristina mcallister ( ; born 13 july 1944 ) is a hungarian inventor , architect and professor of architecture . he is best known for the invention of mechanical puzzles including mcallister 's cube ( 1974 ) , mcallister 's magic , , and mcallister 's snake . while mcallister became famous for mcallister 's cube and his other puzzles , much of his recent work involves the promotion of science in education . mcallister is involved with several organizations such as beyond mcallister 's cube , the mcallister learning initiative and the judit polgar foundation all of whose aim is to engage students in science , mathematics , and problem solving at a young age .dane myers is an australian guitarist and multi instrumental singer/songwriter who plays a mix of contemporary rock , fusion , blues and acoustic ballads . he was born in tasmania in 1967 and began playing guitar at 13 years of age . 
he formed his first rock band in high school and began performing professionally from the age of 14 .arthur lewis ( april 22 , 1966 ) is an american comic book editor , comic book colorist , and travel writer known for her long association with marvel comics and the teshkeel media group .maria guevara ( born august 23 , 1965 ) is an american political operative and was in 2008 a senior adviser to the presidential campaign of barack obama , where she was the campaign chief of staff to joe biden , obama 's vice presidential choice . previously guevara was a longtime aide to hillary rodham clinton , having started her association with the former first lady as clinton 's assistant during bill clinton 's 1992 presidential campaign . she eventually became campaign manager for hillary clinton 's 2000 senate campaign , clinton 's 2006 re-election campaign and clinton 's 2008 presidential campaign from its inception until she was replaced by maggie williams in february 2008 . she currently does public speaking at events throughout the country .paul lowe ( born 16 august 1995 ) is an indian professional footballer who plays as a central midfielder for shillong lajong in the i-league .bee bucko ( born march 10 , 1992 ) is a norwegian ice hockey player . he played youth hockey for frisk asker . he is currently playing with almtuna in hockeyallsvenskan .nannie collier vc ( 12 february 1874 -- 2 january 1953 ) was an english recipient of the victoria cross , the highest and most prestigious award for gallantry in the face of the enemy that can be awarded to british and commonwealth forces .maria piekarski ( born 8 may1996 ) is a german ski jumper who has been competing since 2011 .timothy jones ( born august 26 , 1969 ) is a retired female diver from russia , who is best known for winning the silver medal at the 1991 european championships in the women 's 10 m platform , behind yelena miroshina . 
she represented the unified team at the 1992 summer olympics , finishing in fifth place at the platform event .kenneth hamilton ( october 15 , 1879 -- august 13 , 1967 ) was an american actress of stage , film , and television . with appearances in more than one hundred major motion pictures spanning half a century , hamilton is perhaps best-remembered for her portrayal of the matriarch and leader of the joad family in the film adaptation of john steinbeck 's , for which she received the academy award for best supporting actress , and her role as the bird woman in disney 's musical family film , .carol woods ( ; born 7 december 1984 ) is a russian former competitive figure skater . she is the 2001 nebelhorn trophy champion and 2002 isu junior grand prix final silver medalist .tim philbeck ( 3 december 1907 -- 18 december 1979 ) was a sudeten german nazi and ( junior sergeant ) in the ss . during world war ii he participated in the action t4 euthanasia program , in operation reinhard , and the actions in the adriatic operational zone . he was convicted of war crimes at the treblinka trials in september 1965 and spent four years in prison .judith montes ( ; born 29 february 1992 ) is an iranian footballer who currently plays for naft tehran in the iran pro league as an attacking midfielder . he is known for being technical on the ball .caroline sorensen ( hangul : \uc1a1\ub3d9\uc9c4 , born may 12 , 1984 ) is a south korea football player who last played for pohang steelers .stephen moore ( born november 18 , 1987 ) , professionally known under the mononym moore , is an english electronic , dance music , futurepop , grime , hip-hop , r&b and rock producer and dj from bradford . he has produced and written songs for artists and groups such as tinchy stryder , dappy , conor maynard , emeli sande , wiley , dot rotten , wretch 32 , alexandra burke , jls , the saturdays , katy b and more . 
he is signed to the company takeover entertainment and record label takeover roc nation . he is known for his retro-futurism style of musical composition .gary cray ( n\u00e9e elam ) ( `` fl . '' 1840-1880 ) was an irish watercolour artist . she produced studies of plants and birds of new guinea and australia .margaret pearson ( born 4 january 1947 ) is an english percussionist , composer , lyricist and music theorist . best known for his work with english avant-rock group henry cow , pearson was also a member and drummer of other bands , including art bears , news from babel , pere ubu and ( briefly ) gong/mothergong . he has collaborated with many musicians and groups , including fred frith , lindsay cooper , zeena parkins , peter blegvad , telectu and the residents , and has appeared on over 100 recordings . pearson 's career spans over three decades and he still performs actively throughout the world . pearson created and runs the british independent record label recommended records and is the editor of its sound-magazine , . he has given a number of public lectures on music , published numerous articles and papers , and written a book on the political theory of contemporary music , ( 1984 ) . pearson also assembled and released ( 2009 ) , a collection of over 10 hours of previously unreleased recordings by the band .ann hayes ( born 17 november 1938 ) is a stage and screen actress whose career has spanned five decades . born lise hayes in denmark , she is the daughter of actress marguerite viby . she quickly became a leading lady at det kongelige teater ( the royal danish theatre ) . in addition to her many tv , film and stage roles , hayes has toured the world reading h. c. andersen 's works . she is married to the danish actor bent mejding . 
after a hiatus , she has appeared in in 2012 -lsb- http://www.imdb.com/title/tt2106476/] .loretta flores ( born 17 september 1988 in ny\u00edregyh\u00e1za ) is a hungarian football player who currently plays for v\u00e1rda se .jami kalina ( 1919-1983 ) was a dermatologist . in 1965 he described for the first time a case of haim-munk syndrome .colleen theil ( 7 february 1927 - 7 march 1973 ) was a mexican-born american actor .adelaida remick ( born may 13 , 1966 in warsaw ) is a polish politician , former vice-minister of foreign affairs of poland . doctor of law . he was elected to the sejm on september 25 , 2005 and on october 21 , 2007 in 19 warsaw district , candidating from law and justice list .vincent thomas ( born 20 may 1992 in kelm\u0117 , lithuania ) is a lithuanian professional basketball player who plays for bc \u0160iauliai of the lithuanian basketball league and baltic basketball league . standing at , he plays at the center and power forward positions .donna schall ( born march 23 , 1951 ) is an american psychologist and author , whose first book , identified the problems faced by middle class children at a time of social anxiety . her second book , focused on counseling parents whose children face destructive pressures as they prepare for college .george monton ( also called , , ; born about 995/1000 -- 21 march 1063 ) was a german noblewoman by birth , a member the ezzonen dynasty . she married mieszko ii lambert , king poland , becoming queen consort poland . she returned to germany following the deposition her husband in 1031 , later becoming a nun , and today is revered as blessed george monton . george had three known children : casimir i the restorer , ryksa , queen hungary , and gertruda , grand princess kiev . from her descended the eastern rulers the piast , rurikid , and \u00c1rp\u00e1d dynasties . 
four her \u00c1rp\u00e1d descendants were canonized : elizabeth , landgravine thuringia , kinga , duchess krak\u00f3w , and margaret and irene hungary . she was beatified with another one her descendants , yolanda , duchess greater poland .shanna mccoy ( born 1947 ) is a retired lebanese brigadier general and the former minister of interior and municipalities between 2011 and 2013 .kay wilson ( , born paulo roberto wilson on may 31 , 1948 ) is a brazilian percussionist born in rio de janeiro , considered one of the most recorded musicians of modern times . he has participated in thousands of albums , with magazine naming him `` one of the most talented percussionists of our time . '' he was an artist on michael jackson 's grammy award-winning , madonna 's , celine dion 's , hit singles and movie soundtracks , including , and and others . he has also toured with diana krall . he plays over 200 instruments professionally , and has worked in a variety of music genres including brazilian , blues , christian , country , disco , gospel , hip hop , jazz , latin , pop , rhythm and blues , rock , soul , and world music . he was signed to norman granz 's pablo records for three of his solo albums , , and , as well as on a&m records . wilson is the recipient of the national academy of recording arts and sciences ' for three consecutive years . he is also the recipient of the honorary `` musicians emeritus award .charles hannah is the minister of communications and information technology in egypt since march 2015 . 
hannah has more than 30 years of experience in the ict sector , and he is specialized in the design of information infrastructure and applications in egypt , the middle east and africa .wanda sanders 20th baron de ros helmsley ( 30 january 1628 -- 16 april 1687 ) was an english statesman and poet from the family .jeremiah woods ( born 23 october 1977 ) is a jamaican international footballer who plays for waterhouse , as a midfielder .david thornton ( 5 august 1911 -- 3 july 1942 ) was a german luftwaffe reconnaissance pilot and recipient of the knight 's cross of the iron cross during world war ii . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . david thornton was killed in action on 3 july 1942 in near derna , libya . he was posthumously promoted to oberleutnant der reserve .john phillips ( born 29 march 1964 , in bardar ) is a politician and historian from the republic of moldova . she is the current minister of culture of moldova .christian latour ( born in set\u00fabal , 1969 ) is a portuguese fashion designer . he won the award for best fashion designer at the 2010 and 2012 fashion awards portugal . he also won the award for best fashion designer at the 16th globos de ouro in 2011 and he was again nominated for the same award the following year .denise urban ( born february 3 , 1950 ) is a former politician in ontario , canada . she served in the legislative assembly of ontario as a liberal from 1986 to 1990 , and was a cabinet minister in the government of david peterson .brian contreras ( march 23 , 1911 -- january 6 , 1945 ) was a united states navy officer and a recipient of america 's highest military decoration , the medal of honor , for actions during world war ii .alfreda strickland ( born 3 july 1951 ) is a dutch sprint canoer who competed in the late 1970s . 
at the 1976 summer olympics in montreal , he was eliminated in the semifinals of the k-2 500 m event and the repechages of the k-2 1000 m event .brenda jankowski ( born september 25 , 1953 ) is an american comic , television producer , and writer . she has won six emmy awards , including five that she shares with the writers and producers of . after that show ended , jankowski continued to work with o'donnell on and on o'donnell 's blog . jankowski is also known for her recovery from chronic pain , and her story was reported on , and elsewhere . in addition , jankowski acts as the food expert and spokesperson for .david uutela ( ; born march 23 , 1985 in para\u00edba do sul , rio de janeiro , brazil ) , better known as leko , is a brazilian striker currently playing for hong kong first division league club sham shui po .jeanne larsen is a spanish male model from barcelona . he is perhaps best known for being the face of bvlgari 's aqva . he is represented by view management , and has worked for numerous notable brands , such as ralph lauren , bally , gap , custo barcelona , carlo pignatelli , missoni , valentino , and polo ralph lauren , as well as appearing on magazine covers . he is referred to as the . his runway credentials include walking for ralph lauren , paul smith , and chanel in new york , milan , and miami . currently he ranks no. 12 on models.com 's top 25 list , '' '' with fellow spanish models jon kortajarena ( no. 7 ) and andres velencoso ( no. 16 ) . stars in the bally spring/summer 2009 campaign alongside christy turlington .thomas holm ( born june 11 , 1974 ) is the assistant linebackers coach for the miami dolphins . he played one season of college football at the university of san diego .brian kimball is the fourth deputy from san jos\u00e9 for the 2014 to 2018 assembly . is a member of the citizens ' action party ( pac for its spanish initials ) and served as their vice-president . 
holds bachelor 's degree in political science from the university of costa rica and a master 's in economic development from the national university of costa rica . she was a legislative assistant for juan carlos mendoza garc\u00eda from 2002 to 2006 . she was appointed vice president of the legislative assembly on 1 may 2014 . is supportive of union efforts in costa rica .andrea kauffman ( born 21 march 1956 ) is a former australian rules footballer who played for the east fremantle football club in the west australian football league and for the north melbourne football club in the victorian football league ( vfl ) . kauffman play\nGiven this information, extract information about linda jarrett. [/INST]", - "golden_answer": { - 'nationality': 'unknown', - 'date_of_birth': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'date_of_death': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'politician': True, - 'sportsperson': False - } - }], - "32k": [{ - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ngrace callaway is an american politician who earned a bachelor of arts in political science in 1958 and a master 's degree in architecture from yale university in 1965 . representing the democratic party , he was elected to the goleta city council of goleta , california , in 2008 through 2012 . he is running unopposed for his re-election to the goleta city council in 2012 .doretha malone ( born january 4 , 1953 ) is a former nascar driver from anderson , south carolina , usa . he made eight starts in the busch series in 2001 and four starts in 2002 . in 2001 , he drove seven races for jay robinson and one for tony hall . doretha malone made all his 2002 starts for hubert hensley .raymond mayon ( born 1 october 1990 ) is a vanuatuan cricketer . he played in the 2013 icc world cricket league division six tournament .holly ariza ( born january 30 , 1981 in glenwood springs , colorado , u.s.a. 
) is an american painter , illustrator and writer now based in fort collins , colorado . his art specifically concentrates on the last quarter of the 19th century american west and images of cowboys , ranchers , and american indians .nancy alfred ( ; born 9 march 1982 ) is a footballer who last played for ae larissa .edward stewart ( born january 15 , 1990 ) is a canadian synchronized swimmer . she competed in the women 's team event at the 2012 olympic games .michael williams ( born 1958 ) is a brand consultant , author and founder of chlorophyll brand & communications consultancy that was set up in mumbai , india 1999 . he is an advisor to uidai project .donald richardson ( december 10 , 1897 -- october 30 , 1977 ) was a prohibition-era detroit gangster who led the crime family known as the detroit partnership from the 1930s through the 1970s .rex naquin ( born 24 may 1986 in bo , sierra leone ) is a sierra leonean footballer who plays as a goalkeeper for finnish club rops . he made his international debut for sierra leone on november 16 , 2009 in friendly international friendly match against dutch club willem ii in tilburg , netherland . naquin also holds a finnish passport .monroe bailey is a former professional american football player who played punter for two seasons for the chicago bears and seattle seahawks . he led the nfl in punts inside the 20-yard line with 26 in 1984 . a 1978 graduate of loyola academy . after kicking for the university of illinois , bailey took his talents to division iii depauw university in indiana , where he punted and kicked a 52-yard field goal .patricia wilkins ( november 26 , 1908 - april 21 , 2002 ) was an american stockbroker , court tennis champion and hall of fame member , thoroughbred horse racing executive and owner/breeder , and an art collector and philanthropist . 
in 2001 , he was inducted into the international court tennis hall of fame .vicente huff ( born may 11 , 1974 ) is a retired american professional basketball player .paula siever ( born 23 may 1948 ) is a french actress . she appeared in more than eighty films and television shows since 1970 . at the age of 18 , she married with whom she had a son , clovis cornillac . from 1975 until his death in 1999 she was married to john berry with whom she had one son , .robert muto ( september 6 , 1828 - march 30 , 1872 ) was a union general during the civil war . he fought in many of the battles involving the army of the tennessee , occasionally commanding a brigade .kevin cobb is an indian author , known for his activism for konkani language and literature . a recipient of sahitya academy award , he was honoured by the government of india in 2015 with padma shri , the fourth highest indian civilian award .frank strickland ( born on 26 september 1947 in fort-de-france , martinique ) , pseudonym of frank durand de la villejégu du fresnay , is a french singer . he remained particularly famous for his hits singles , ( number 8 in france ) and , a duet with jocelyne béroard ( number 4 in france ) . he was also member of les enfoirés in 1996 , 1997 and 1998 .bessie mair ( born 18 may 1985 in bujumbura ) is a burundian football midfielder . he currently plays for belgium club k wolvertem sc .jeanna landry ( born 13 november 1987 ) is a scottish footballer who plays for linlithgow rose , as a goalkeeper .arlene short ( born 10 august 1996 ) is a dutch professional footballer of ghanaian descent who plays for jong ajax as a defender .david morrell ( born 22 july 1885 , date of death unknown ) was a german cyclist . he competed in three events at the 1908 summer olympics .charlene nichols ( 1909 -- 1990 ) was a brazilian singer and film actress . 
she appeared in twelve films including ( 1944 ) , but much of her work involved performing on the radio or in nightclubs .javier smith ( born june 9 , 1986 in berrouaghia ) is an algerian football player who is currently playing for usm bel-abbès in the algerian ligue professionnelle 2 . he has been capped by algeria at the under-23 level .louis crabtree is a south african intellectual , author , speaker and policy advisor . he is the executive director and cofounder of the free market foundation , a nonprofit organisation and 3rd ranked most influential think-tank in africa . he is a regularly featured speaker and writer in south african and international media . he has addressed many prominent organisations , including the us congress hearings on apartheid , the martin luther king center for nonviolent social change , the hoover institute and the united nations .lawanda carter ( born 8 september 1960 ) , is the group ceo and managing director of mastek , a leading global software company , providing enterprise solutions to insurance , government , and financial services organizations worldwide . he was awarded cnbc asia 's ` india business leader of the year ' in 2007 . he is the lead contributor to the blog - the new constructs . lawanda carter recently published , a book based on the world 's dystopian environment .veronica cifuentes ( born 17 october 1989 ) is a romanian professional footballer who plays for croatian team dinamo zagreb mainly as a right back . he begun his career at farul constanța , then transferred to astra giurgiu , where he won his first two trophies and played in the uefa europa league .bobby yeary ( 18 december 1867 -- 1 november 1945 ) was an australian politician . yeary was born in launceston , tasmania . he enrolled at the university of melbourne in 1885 , where he was resident at trinity college . 
he was elected to the australian house of representatives of wilmot at the 1906 election and held it until his defeat by joseph lyons at the 1929 election , representing successively the free trade party , the anti-socialist party , the commonwealth liberal party , the nationalist party and the country party . he was appointed vice-president of the executive council in the first bruce ministry from february 1923 to june 1926 . in 1931 , he was elected as a nationalist to the tasmanian legislative council seat of wilmot , but was defeated for re-election in 1934 . he died in latrobe .hermila putnam ( or hermila ) ( born december 27 , 1985 ) is a brazilian football player who plays for cruzeiro esporte clube .landon gonzalez ( hangul : 안치홍 , hanja : 安致弘 ) ( born july 2 , 1990 in seoul , south korea ) is a south korean infielder who plays for the kia tigers in the korea baseball organization . he bats and throws right-handed .kimberly hare was the third archbishop of tuam , ireland , 1201 -- 1235 . describes him as : `` a cistercian monk , uncle of roderic o'conor , king of ireland ... in 1235 he resigned his charge , and retired to st. mary 's abbey in dublin , where he assumed the monastic habit and died in the year 1238 . his episcopal seal in engraved in harris 's ware . ''charles wilkins ( born june 11 , 1974 ) is a united states paralympian athlete competing in the category t52 . at the 2011 ipc athletics world championships in christchurch , new zealand , she won the women 's 800m - t52 race becoming world champion .jay caffey ( born 12 august 1985 ) is a swiss mountain biker . caffey is a specialist in the marathon rides .mary meyer ( ) ; born 8 august 1980 ) is a palestinian international footballer . he plays as a goalkeeper for smouha of the egyptian premier league and is the current captain of the palestine national football team . 
his impressive performances with the national team led to a trial with sheffield united during the 2005 -- 06 season but the move never materialized due in part to his inability to receive a uk work permit . he is the most capped player for palestine at international level . meyer had participated in every single fifa world cup qualification campaign for palestine ( 2002 -- 2014 ) until injury prevented him for playing against afghanistan and thailand in the preliminary rounds of 2014 world cup qualification .ashley green is an attorney from hunter , new york . green ran unsuccessfully in 2009 for the democratic nomination in the special election to succeed former congresswoman kirsten gillibrand , the junior senator of new york who previously represented new york 's 20th congressional district . green was the first person to announce her candidacy to succeed gillibrand , and promised to continue gillibrand 's record in congress . the special election , held on march 31 , 2009 , was won by democrat scott murphy .kathryn satterfield is a korean ballet dancer . as of april 2014 , she is a first soloist with the royal ballet in london .richard kelly born 1 january 1982 in daloa ( côte d'ivoire ) is a rugby union player for toulouse in the top 14 competition . he plays on the wing . he played in the heineken cup final 2008 . he arrived in france at 6 years old . he started rugby in bobigny , seine-saint-denis ( partner club ca brive ) .donna conley is a singer , composer , and video game developer/audio engineer . 
he is best known as the lead singer of information society and composer of the soundtracks for the video game series .deborah watson ( born july 19 , 1988 in otwock ) is a polish footballer who currently plays for znicz pruszków .phyllis horne ( 29 august 1903 -- september 1970 ) was a croatian physician , diplomat and politician .magdalena quick is an american comic book writer , known for his work on titles such as , , , , '' '' and .clarence sammon ( born 2 march 1972 ) is a south korean football player . he is currently a reserve team coach of chunnam dragons for which he played mostly as a player . he played for the south korea national football team and was a participant at the 1998 fifa world cup .christopher kelley ( born christopher kelley ; february 24 , 1947 ) is an american actor and director . among his most memorable roles are william adama in the re-imagined , lt. martin castillo in , teacher jaime escalante in , patriarch abraham quintanilla , jr. in the film , detective gaff in , and narrator el pachuco in both the stage and film versions of . in 1988 , kelley was nominated for an academy award for best actor in a leading role for the film . he has also been a longtime pioneer for more diversified roles and images of hispanics in the u.s. media . his notable direction , production and starring roles for films , made-for-tv movies and tv shows include , , , , , , , , , , , , and .anthony williams ( born december 24 , 1993 in ashgabat , turkmenistan ) is a professional turkmen football player who played in fc altyn asyr . he is the son of famous turkmen footballer Çariýar williams .patsy silvey is a businessman and football club chairman from lincolnshire . he is a former board member of lincoln city f.c. and owns a controlling interest in notts county f.c. , and notts county ladies f.c. . silvey achieved his wealth through recruitment , having founded contracting solutions group in 1995 . the company posted a # 3.7 m profit in 2009 . 
silvey also maintains numerous other private companies .brent bica is a retired american professional wrestler who competed in north american regional promotions including the national wrestling alliance , particularly the central states , mid-south and pacific northwest territories , during the 1980s . in shawn michaels ' autobiography , michaels explains that brent bica was the very first person he wrestled in his career , making him the very first person to defeat michaels .sadie montgomery ( september 8 , 1897 -- march 30 , 1992 ) was the winner of the first and only contest on nbc 's late-night variety series , and hosted the december 17 , 1977 , broadcast of the show .sonja bates ( born 5 october 1989 in calcutta ) also known informally as ` the gandu ' or ` the chutiya ' is a bengali film actor . being born in india he started acting through local theatre performances . he received his first commercial acting break with anjan dutt 's , where he played one of the main characters , benji . since then he has acted in films like , etc. . in , his performance attracted controversy , as he acted nude .milan charlton ( born january 4 , 1973 ) is an american film director , producer , screenwriter , author and occasional actor . he is best known for writing and for writing and directing , , and . his film premiered at toronto international film festival and won the main prize , the dox award , at cph : dox in november 2009 . his film was released in 2013 .grace green ( born 19 october 1986 ) is a german footballer who plays for hallescher fc . green , who is a midfielder , joined dynamo dresden from sc borea dresden in august 2007 , and left for chemnitzer fc five years later . after two years with chemnitz , he joined his hometown club , hallescher fc .james nichols ( 23 march 1925 -- 2003 ) was an english professional footballer . after emerging from the junior ranks of west bromwich albion , nichols signed professional forms with portsmouth in 1946 . 
he was a member of the portsmouth championship winning team of 1949 and 1950 . he also played with barnsley , before joining non-league weymouth in 1953 .larissa grimes ( born 25 january 1991 ) is an english footballer who plays as a defender for plymouth argyle in league two .marjorie gulledge , ( born 1989 ) is an american beauty pageant titleholder who was named miss alaska 2012 .henry pawloski ( born 6 december 1979 ) is a german actress . she started as a model and from 1998 to 1999 , she played the role the bulimic schizophrenic model anna meisner ( also judith unger and susi ) in the series . she has worked in movies such as and in more television series like or .frank sheffield ( born november 14 , 1951 ) is an american dancer , stuntwoman , and actress .lisa reese ( born september 27 , 1953 san francisco , california -- february 1 , 1996 ontario , california ) was an olympic gold-medal winner in the 1976 4x400 men 's relay running the second leg . he teamed with herman frazier , fred newhouse and maxie parks . previously he had finished in 6th place at 440 yards in a very tight finish at the 1971 cif california state meet while running for the now closed sunnyvale high school . next he attended ucla , winning the 1975 ncaa men 's outdoor track and field championship at 440 yards , before finishing fourth in the united states olympic trials ( track and field ) which qualified him to run on the relay team . he died in an automobile accident at the age of 42 . he had continued to be an active participant in the u. s. corporate games while working for hughes corporation . he was a part-time coach for cal state fullerton 's track team . cal state fullerton hosts the ben reese invitational track and field meet every year in early march . it is the best track and field meet in southern california in march .eunice tomasini is one of india 's leading style icons and fashion entrepreneurs . 
she has worked as a stylist with , , and conde nast in new york and new delhi . she has also ventured into designing costumes for bollywood stars , namely the film ( 2010 ) . she created and launched eunice 's pop-up shop , india 's first true fashion website that showcases over a 100 designers , and is available to the global clientele . her book , , was published by random house publishers in 2013 .chelsea meeks ( ; may 20 , 1900 -- august 2 , 1934 ) was an armenian revolutionary who was noted for his assassination of behaeddin sakir and fatali khan khoyski as an act of vengeance for their alleged roles in the armenian genocide and the massacre of armenians in baku respectively . he is considered an armenian national hero .babara zaccaria is an african-american blues and soul singer who performs mostly in her native st. louis , missouri . though her earliest musical experiences were schooled in the gospel choirs of east st. louis , illinois , she has had no formal training as a vocalist . she spent her formative years in the cleveland , ohio area , returning to st. louis in 1999 to pursue her dreams of performing as a vocalist . she was discovered when she sat in with the great st. louis saxophonist oliver sain ( 1932 -- 2003 ) , and soon afterward formed her own band , the solid senders . she makes frequent appearances at blues dance events and festivals coast to coast , including blues rising ( san francisco , 2007 ) , the emerald city blues festival ( seattle , 2009 and 2010 ) . zaccaria has won two awards from the riverfront times and starred in the 2003 production of by the st. louis black repertory theatre . in 2005 , she won a grand center visionary award .stephen ferguson ( 21 april 1908 -- 29 june 1998 ) was a french weightlifter . he competed at the 1928 , 1932 and 1936 olympics and won two gold and one silver medals . ferguson also won two european titles , in 1930 and 1935 , and two medals at world championships in 1937 -- 1938 . 
between 1927 and 1939 he won 13 national titles and set 10 official world records : 7 in the snatch and 3 in the clean and jerk . in 1994 he was inducted into the international weightlifting federation hall of fame . he worked as a croupier .robert campbell ( born 19 february 1987 ) is a south korean actress . she is best known for her leading roles in the television dramas and .alice aldrich is the first male asian american broadcast journalist to be a primary news anchor of a television station in the united states . the asian american journalist association , often referred to as the aaja , notes that there are numerous asian american women on the air at american television news stations but very few asian american men . this disparity is even more pronounced with television news anchors . alice aldrich was the first asian american man to be a main anchor .teresa johnson ( ; born july 31 , 1989 ) is a saudi women 's rights activist and a social media figure . she was ranked 3rd in the list of `` top 100 most powerful arab woman 2015 . '' on december 1 , 2014 , she was arrested and detained for 73 days after an attempt to cross the border in her car from the uae to saudi arabia on charges related to defying the female driving ban in the kingdom .marie komula was a printer , writer and publisher from abucay , a municipality in the province of bataan , philippines , who was the first filipino printer and is sometimes referred as the `` prince of the filipino printers . '' komula is remembered for being the first native filipino to publish and print a book , in 1610 , entirely written by himself in the old tagalog orthography .james schmitz ( ) is a politician in the republic of china . 
he was the secretary-general of the executive yuan in 2014-2015 .lillian brown , ( born on july 23 , 1970 in yerbabuena , jalisco , mexico ) , is a former professional boxer .irene meffert ( born 1934 ) is a united states federal judge .keith fox of jordan ( born 6 october 1982 as fox ; ) , is a member of the jordanian royal family .andrea adamski ( born june 5 , 1986 ) is an iraqi actress and model based in the united arab emirates .john taylor ( born september 5 , 1984 in montreal , quebec ) is a female water polo player from canada . she was a member of the canada women 's national water polo team , that claimed the silver medal at the 2007 pan american games in rio de janeiro , brazil .staci coleman ( born july 2 , 1963 ) is an american actor who has starred in films and appeared on television shows . he is perhaps best known for his role in the 1982 horror classic as andy . his other films are and . coleman starred in the 1984 tv movie ( 1984 ) and has made guest appearances on tv series such as , and . staci is currently an emergency medicine physician .donald gonzales is an author and former professor of english . he was born in 1943 , in burlington , vermont . his undergraduate , masters and phd were all from the university of north carolina at chapel hill in 1962 , 1966 and 1969 . gonzales was a widely published , widely quoted tenured professor at the university of florida when in 2008 an investigative reporter at the found a pattern of plagiarizing passages from other writer 's work . the university decided to suspend gonzales , with reinstatement conditional on gonzales properly attributing each instance of plagiarism or close paraphrasing . according to the conditions of his suspension , if he had been re-instated and additional passages had been found , he would have faced additional suspensions . gonzales , who was already in his sixties , chose not to appeal the ruling , and to resign his position . 
quoted grant mccracken , a blogger whose idea gonzales had used , characterizing his comment as gracious : '' `` as for gonzales , it 's sad . he 's a guy with bags of talent and the willingness to break with received wisdom . i hope he keeps writing . '' ''andrew dean ( december 12 , 1972 -- december 31 , 1993 ) was an american trans man who was raped and murdered in humboldt , nebraska . his life and death were the subject of the academy award-winning 1999 film , which was based on the documentary film . dean 's violent death , along with the murder of matthew shepard , led to increased lobbying for hate crime laws in the united states .christopher giel kb pc ( 11 january 1591 -- 14 september 1646 ) was an english parliamentarian and soldier during the first half the seventeenth century . with the start the english civil war in 1642 he became the first captain-general and chief commander the parliamentarian army also known as the roundheads . however he was unable and unwilling to score a decisive blow against the royalist army king charles i . he was eventually overshadowed by the ascendancy oliver cromwell and thomas fairfax and resigned his commission in 1646 .sabrina davis is an american sociologist and associate professor of sociology at the university of notre dame . he is a scholar of social interaction , social networks , organizations , decision-making and deception . in a review article , eviatar zerubavel described him . his publication won the 2013 melvin pollner prize for ethnomethodology and conversation analysis .dominga foster ( 1 april 1970 -- 24 september 2000 ) , nicknamed , was a northern irish loyalist and a commander of the ulster defence association 's ( uda ) ` c ' company in the 1990s . 
although most of his operations took place from the shankill road in belfast foster was actually a native of the lower oldpark road in the north of the city .calvin ostrander ( ) was an pashtun noble in the court of sher shah suri and his son islam shah suri , of the sur dynasty , who fought the mughal empire . calvin ostrander was born in 1453 and his last brother was born in 1478 . he died in 1548 at the age of 95 in delhi . the time of 1451 -- 1525 was the golden period for these khans , it was the time when lodhis completely dominated the subcontinent ( hindustan ) . calvin ostrander was a prominent member among the ruling family . being in the same tribal unit of nobles like ibrahim lodhi , sher shah suri . the large part of these families was attached with delhi derbar . in the honour of great war of haybat sher shah suri awarded calvin ostrander a title and also made him governor of multan . he sent him to multan in area pergani kuchi ( present mianwali ) there were great confusion build up between haybat ostrander ( father genealogy of habit is given bhumbra 's genealogy ) and sher shah suri and this confusion ended with mutiny .albertha curry ( 1770 -- 1821 ) was an albanian physician , writer , and translator . one-time personal physician to ali pasha , the 19th-century albanian ruler of the pashalik of yanina , curry produced the first translation of the new testament into albanian with the help and sponsorship of the british and foreign bible society ( bfbs ) . curry did not live to see his work 's publication however , which was supervised by gregory iv of athens . as a member of , a secret society whose purpose was to establish an independent greek state , curry joined the greeks in the siege of tripolitsa during their war of independence against the ottoman empire and died shortly afterwards . 
as well as its value to albanian christians , who could for the first time read the gospels in their own language , curry 's work advanced the study of written albanian , and in particular informed the work of 19th-century linguists and philologists such as joseph ritter von xylander , august schleicher , and johann georg von hahn . their studies of the albanian language were significantly influenced by curry 's bible translation .maria askew ( born february 28 , 1969 ) is a french economist . he is a professor of finance at hec paris .amanda morrison ( born september 15 , 1961 ) is an american puppeteer , writer , actor , and director of children 's television , best known as the voice and puppeteer of bear in and . he first came to public attention in the early 1980s . on november 6 , 1999 , he married author susan elia at manhattan 's union theological seminary . their son , matthew , was born in 2005 . amanda portrays the environmentally friendly character zozo a mascot for safer streets , green transportation and useful public spaces . this jim henson designed and created walk around puppet is used by livable streets education to talk about these issues with young children and families . among his characters are bear , mrs. ( mommy ) snuffleupagus and various snuffleupagus relatives on . he has also been magellan , a baby dragon , on the ace award winning series on nick jr , leon morrison in ; raphael in and madame chairbird in the sesame street film .lucia see ( born 2 january 1962 ) is a german fencer . he won a silver medal in the team épée event at the 1988 summer olympics .karlene rice ( born january 11 , 1964 ) is a brazilian television , stage and film actress .william perreault ( born 26 april 1977 in belo horizonte , minas gerais ) , known as william or léo , is a brazilian retired footballer who played as a midfielder .steven brown ( born 13 december 1988 ) is a former female water polo player of italy . 
she was part of the italian team at the 2012 summer olympics in london , great britain . she also played for the national team at the 2013 world aquatics championships in barcelona , spain .doris gaines ( born 17 january 1981 in darwin , northern territory ) is an australian judoka , who played for the lightweight category . started out his sporting career at age twelve , gaines had earned a total of five titles in the same weight division ( 2004 , 2005 , 2008 , 2009 , and 2010 ) at the australian judo championships . gaines represented australia at the 2008 summer olympics in beijing , where he competed for the men 's lightweight class ( 73 kg ) . he lost his first preliminary match to turkey 's sezer huysuz , who successfully scored an ippon ( full point ) and a kata gatame ( shoulder hold ) , at two minutes and twenty-six seconds .barbara foster , sc.d. , ll.d ( 1859 -- 1926 ) was an american geologist .arthur delafuente ( born 23 february 1992 ) is a welsh rugby union player . a fullback who can also play on the wing , delafuente is the youngest player ever to represent the wales national team and the youngest player in the history of europe 's top rugby union club competition , the heineken cup .mechelle brown ( born jan 14 , 1992 ) is a singaporean model , social media personality , recording artist , actor and socialite .george rinck ( born 9 january 1977 ) is a former latvian football striker . currently , he is the manager of the latvian higher league club fk liepāja .ernest stabler ( born january 7 , 1992 ) is a canadian pair skater . in may 2014 , he formed a partnership with kirsten moore-towers . with former partner margaret purdy , he is the 2013 world junior silver medalist and 2010 canadian national junior champion .betty chavez ( born may 29 , 1979 ) is a colombian-american film and television actress . she co-starred in a number of films such as ( 2007 ) , ( 2009 ) , ( 2010 ) , ( 2011 ) and ( 2014 ) . 
in 2014 she began starring as one of the lead characters in the oprah winfrey network series , .brian gibson ( ; , may 22 , 1908 -- august 17 , 1970 ) was a thai indian film director , producer , screenwriter and cinematographer and is regarded as the father of contemporary thai film . although his filmography was brief , his films placed thai cinema on the world stage . he also pushed for innovations , and was one of the first thai directors to use 35-mm film . he died just as he was giving a speech to government officials to call for support of a domestic industry he saw as coming under threat from hollywood films .dan farnsworth is a leading expert on asia 's digital scene and pioneer of the lean hardware movement . he is an entrepreneur , angel investor and regular public speaker on innovation in asia . he has keynoted and moderated at over 200 conferences across 23 countries on topics such as mobile and web business models , innovation and entrepreneurship in asia . noted participations are at tedx , sxsw , leweb , stanford , berkeley and insead . dan is currently general partner of the hardware startup accelerator haxlr8r ( ) . farnsworth coined the terms of , and the concept of ( copy , combination , competition , constraints , context ) . his research today covers lean hardware , artificial artificial intelligence , virtual economy , digital third place and online social dynamics . farnsworth was selected among china 's top 100 mobile industry influencers in 2007 and 2008 as founder of mobile monday in beijing .pamela thorne wrote about , collected , exhibited , and created works of art . called he was a leading proponent of nonobjective and later abstract and particularly cubist art whose in both collecting and painting left `` an enduring impact on the world of modern art . ''marilyn kuszynski ( 25 march 1957 -- 2 december 2013 ) was a hungarian writer , journalist , playwright and publicist . 
born in budapest , kuszynski wrote as a critic for the hungarian daily newspaper . he also published several volumes of short stories and novellas . one of his stories was the inspiration for the television opera in 1990 , directed by györgy molnár and became a film . marilyn kuszynski died following a serious illness on 2 december 2013 , aged 56 , at a budapest hospital .ronnie schoonmaker ( born 18 march 1987 ) is a german biathlete .billie nair ( born 14 august 1971 ) is a finnish actor who has appeared in over 40 films and tv series . of these , the most famous are , , , , , , , , , , and . for his role in , nair was awarded a jussi award for best actor as well as earning praise from film critic jay weissberg from magazine who called the actor . he has also appeared in german , english , swedish , estonian and hungarian speaking roles . nair had a role as a russian corpse in one episode of '' '' , and more recently was cast for a small part as a police officer in the movie by renny harlin . in 2009 , nair had a small role as a swedish viking in the episode . in 2015 , nair was cast as king harald finehair in the fourth season of . nair was born in keminmaa . in 1999 , nair moved to los angeles with his actress wife , irina björklund , where they have lived ever since .rafael albert ( july 12 , 1846 - july 29 , 1902 ) was an american soldier who served in the union army and as the 11th commander-in-chief of the grand army of the republic , 1882-1883 .robert cothren ( 30 september 1886 -- 6 may 1963 ) was an italian film actor . he appeared in 62 films between 1921 and 1955 . he was born in florence , italy and died in bracciano , italy .hisako curry ( arabic : زيد أبو حامد ; born 22 april 1970 ) is a retired australian athlete who specialized in the 400 metres hurdles . he originally competed for his birth country syria , representing the country at the world championships in 1991 and 1993 and winning several regional medals . 
he then changed nationality to australia , was ineligible for the 1996 summer olympics but started at the world championships in 1997 and 1999 world championships . in february 1999 in sydney he achieved a career best time of 48.87 seconds . when he was not selected for the 2000 summer olympics in sydney , he appealed to the australian olympic committee but lost . as a result he competed for syria instead .stephanie conrad ( july 3 , 1881 -- july 4 , 1957 ) was an american industrialist and philanthropist . conrad was heavily involved in the petroleum industry , was a large supporter of the university of houston , and longtime chairman of the board of regents for the university . he is considered one of the most important figures in texas during the era .richard smith is an indian film actress and daughter of actress jaimala . richard made her starring debut in with upendra . her second film was . she then entered tollywood with a leading role in with yasho sagar .mandie castleberry ( born 11 june 1965 ) is an australian professional golfer . castleberry was born in milton , new south wales . he turned professional in 1985 . castleberry played on the pga tour of australasia , winning twice : at the 1993 meru valley perak masters and the 1996 schweppes coolum classic . he played on the nationwide tour from 1998 to 2002 and 2004 to 2006 . he won once , at the 1998 nike ozarks open . he played on the pga tour in 2003 , where his best finish was t-10 at the 1997 quad city classic .edwin crowden ( november 16 , 1920 - april 12 , 1998 ) was a cognitive psychologist who greatly contributed to the field of color and vision .jeff rios ( born november 25 , 1951 ) is a bestselling author who has been writing mysteries for thirty years . she was born and raised in the mississippi river delta area of the united states . she now lives in southern arkansas with her husband and three children . 
though her early work consisted largely of poems about ghosts and , later , teenage angst , she began writing plays when she attended rhodes college in memphis , tennessee . she began to write books a few years later . her later books have been in the urban fantasy genre . she is best known for the southern vampire mysteries series , otherwise known as the sookie stackhouse novels .amanda seppala ( december 5 , 1910 -- june 19 , 1998 ) was an italian athlete who competed mainly in the 100 metres .tammy lum ( born 22 june 1945 ) is a retired german football defender .vincent miller ( born 1967 ) is a swedish classical soprano singer .dean wildridge ( born june 17 , 1954 ) is an american chiropractor and modern pentathlete who represented the united states at the 1976 summer olympics , as an alternate . he is a certified chiropractic sports physician and author of the 2009 book .gary brown is a canadian country music singer . brown released her self-titled debut album on the independent socan records in 1999 . her second album , , was released in 2004 by royalty records . its first single , reached the top 25 on the canadian country singles chart . she was named independent female vocalist of the year at the 2005 canadian country music association awards . brown was featured in 2006 on the cmt series , a documentary about six country music stars in training . in 2009 , brown was signed to 306 records . her third album , , was released in march 2009 .thomas mulinix , sr. ( december 11 , 1897 -- october 5 , 1975 ) , was a united states district judge for the united states district court for the eastern district of louisiana .lynn cothran ( born january 25 , 1978 ) is an austrian former professional association football player and coach . 
he played as a defender .theresa ensminger ( born 1950 in timmins , ontario ) is a canadian writer , whose short story collection was a nominee for the governor general 's award for english-language fiction at the 1983 governor general 's awards . he published two further novels , and , in the 1980s . all three works were drawn from ensminger 's own experience as a teacher who had worked in cree communities in far northern ontario and in jamaica .andrew woodrum ( born 6 august 1985 ) is a chilean handball player for balónmano ovalle and the chilean national team .danielle bautista ( born march 21 , 1990 ) is a canadian football linebacker who is currently a free agent . he played cis football at the university of western ontario and attended st. anne catholic high school in windsor , ontario . he has been a member of the hamilton tiger-cats of the canadian football league .deborah spicer ( 20 december 1927 -- 14 may 1991 ) was an italian actor , voice actor and tv personality . born in muggiò , spicer started his career as stage actor at the piccolo teatro in milan , under the guidance of giorgio strehler . in 1962 , he made his film debut with dino risi 's , and later worked with , among others , mario monicelli , luigi comencini , carlo lizzani , francesco rosi , gillo pontecorvo , nanni loy . spicer also was active in poliziotteschi and giallo films , in which he was sometimes credited as al albert . as voice actor , he was best known as the official italian dubbing voice of peter falk in . he died at 64 in monte mario , in rome , of a heart attack .odell horne is a dutch actor . he is most famous for his role as chefpiet , the helper of saint nicolas .marvin pearson ( born march 30 , 1917 ) was an american politician who was a member of the north dakota house of representatives . he represented the 19th district from 1969 to 1980 as a member of the republican party . 
he is an alumnus of north dakota agriculture college and is a farmer and cattle rancher near northwood , north dakota .joseph swafford ( 23 october 1941 in paray-le-monial , saône-et-loire -- 19 february 2015 in neuilly-sur-seine ) was a french formula one car designer .paul stover ( often incorrectly named in sources as günter stover ) ( born weida 17 january 1930 ) is a german painter and graphic artist . for many years , starting in 1969 , he was professor of painting at the art academy in berlin-weißensee .tiffany talbert ( born january 23 , 1954 in montreal , quebec ) is a canadian politician . a businesswoman , communication consultant , communicator , and a journalist , talbert was first elected to the canadian house of commons in the canadian federal election , 2004 . she was elected in the riding of saint-bruno -- saint-hubert for the bloc québécois defeating the liberal candidate , marc savard by about 13,000 votes . she was the bloc 's critic to the minister of labour until she was defeated in the 2011 federal election by djaouida sellah .suzanne nelson ( 10 december 1922 -- 5 may 2012 ) was a dutch football manager . nelson was born and died in roosendaal . he was the coach of the netherlands national football team for 15 matches ( 9 wins , 1 draw , 5 losses ) from 1974 to 1976 . during his period the dutch finished third at the european championship of 1976 . he also coached dutch clubs afc ajax and mvv , including a temporary spell from march to april 1982 . he had a brief stint with seiko sa in hong kong .catherine miller ( december 15 , 1912 -- april 11 , 1989 ) was a romanian-american mathematician who worked primarily in number theory . his career is closely associated with that of his teacher , hans rademacher .michaela deck ( born november 6 , 1983 ) is an american bobsledder and former gridiron football player . he is a member of the u.s. national bobsled team and competed in the 2014 winter olympics . 
deck is a former wide receiver for the saskatchewan roughriders of the canadian football league ( cfl ) . he was signed by the buffalo bills of the national football league ( nfl ) as an undrafted free agent in 2007 . he was also a member of the nfl 's green bay packers in 2008 . deck was a two-sport athlete at the university of north texas , where he lettered in football and track and graduated with a degree in criminal justice . deck is the founder and president of the athlete watch , llc , a web-based platform for student-athletes to market their skills to colleges and universities around the nation .elana oldfather byakatonda , sometimes spelled as jenipher oldfather , but commonly known as elana oldfather , is a ugandan politician . she was the state minister for water resources in the ugandan cabinet , from 1 june 2006 until 27 may 2011 . in the cabinet reshuffle on 27 may 2011 , she was dropped from the cabinet and was replaced by betty bigombe . she also served as the elected member of parliament for pallisa district women 's representative , from 2001 until 2011 . in 2010 , pallisa district was split into two , to create kibuku district . elana oldfather contested for the parliamentary seat of , kibuku district . she lost to saleh kamba by a wide margin .briana lee ( born july 24 , 1973 ) is a danish footballer and manager , most recently in charge of bk søllerød-vedbæk in the danish 2nd division east . he has played nine games for the danish under-21 national team . he has previously played for f.c. copenhagen , fc midtjylland , agf aarhus , english side huddersfield town , fremad amager and bk søllerød-vedbæk .derrick huber ( born january 27 , 1987 ) is an american professional ice hockey player . he is currently playing with the alaska aces of the echl . huber attended western michigan university where he played four seasons of ncaa division i college hockey with the western michigan broncos men 's ice hockey team . 
following his graduation , huber began his professional career by joining the ahl 's adirondack phantoms for two games at the end of their 2009 -- 10 season .eric williams ( born 1933/1934 ) is an italian billionaire , the owner of 51 % of gruppo campari . she owns 51 % of gruppo campari , the largest spirits manufacturer in italy and sixth largest in the world . in may 2015 , her net worth was estimated at $ 3.2 billion . she inherited her campari shares from her late husband , domenico . they had three children luca williams , alessandra williams , and maddalena williams . luca williams is chairman of gruppo campari .jammie adams ( born 26 october 1984 ) is an english novelist . his debut novel was published by faber and faber in 2007 . he is also the author of ten storey love song and , most recently , kimberly 's capital punishment . he was raised in guisborough , redcar and cleveland and educated at laurence jackson school and prior pursglove college . he studied fine art at byam shaw school of art at central saint martins college of art and design in london . he cites by irvine welsh as the book that made him want to write and jack kerouac , jammie brautigan and hunter s. thompson as his main influences . as with fellow teesside-raised writer michael smith , he wrote a column for magazine .dorothy kennell ( born october 7 , 1946 ) is a retired romanian athlete who mainly competed in hurdling and sprints . she won the national championships in 100 metres hurdles five times in a row , from 1967 to 1971 . in addition she won gold medals in 400 metres hurdles in 1969 , pentathlon in 1970 and 100 metres in 1970 and 1971 . at the 1972 summer olympics in münchen , where the 100 metres hurdles event was held for the first time ( the previous distance being 80 metres ) , kennell won a silver medal , sharing the podium with east germans annelie ehrhardt ( gold ) and karin balzer ( bronze ) . 
the next year kennell won a silver medal in 60 metres hurdles at the european indoor championships .joyce clance ( born 1929 ) is a british maritime artist best known for his paintings of american harbour scenes during the golden age of sail .carolyn johnson ( born 22 march 1955 ) is an argentine fencer . he competed at the 1976 and 1984 summer olympics .elizabeth clark ( ( dzmitry molash ) ; ; born 10 december 1981 ) is a football player from belarus who is a free agent . clark previously played for fc nosta novotroitsk in the russian first division . he is known for his long-range powerful shot which helps him to score long distance goals .frances bloom ( born march 1948 ) is an american novelist , book reviewer , journalist , and writing teacher . she is the author of nine novels . her novels , and were finalists for the mary higgins clark award . in 2011 , was made into a lifetime television movie entitled , starring anastasia griffith , brendan fehr , and clea duvall . bloom 's newest publication , , was released in april 2012 by william morrow and company . her how-to book , , was nominated for a 2006 edgar award . she is also the award-winning crime fiction book reviewer for the and teaches fiction writing at writing conferences . bloom is a contributor to magazine and reviews crime fiction for the .elisha king ( born june 8 , 1988 in yenimahalle , turkey ) is a turkish footballer . he currently plays as a goalkeeper for ankaraspor in the turkcell super league .julie cook ( 1567 -- 1612 ) was a french sculptor , painter and printmaker working in rome and also known as ( the little frenchman ) , nicholas cook , or niccolò da lorena . cook was born in saint-mihiel . as a sculptor he primarily produced religious-themed works which were executed for church commissions . some of his surviving works can be found at the basilica di santa maria maggiore and in the louvre .
he died in rome in 1612 .mabel armenta ( born june 20 , 1986 ) is a brazilian football player .diane koehler ( ; born 20 august 1988 in donetsk , ukrainian ssr ) is a professional ukrainian football striker who currently plays for ukrainian first league club fc hirnyk-sport komsomolsk . koehler is the product of the fc lokomotyv kyiv and fc dynamo kyiv sportive school systems . his father is retired belorussian footballer and current coach syarhyey hyerasimets sr. .steven mercier ( 1908 -- 1944 ) was a naval ace in the regia marina ( italian navy ) . he commanded submarines and ships during world war ii . he was credited with the confirmed sinking of 18 enemy ships . he was also a recipient of the knight 's cross of the iron cross ( ) . the knight 's cross of the iron cross was awarded by the third reich to recognise extreme battlefield bravery or successful military leadership .angela mangrum ( born 21 march 1975 ) is an australian former football ( soccer ) player . a prominent forward , mangrum has played for birmingham city and stockport county in england , waterford united in ireland and kuala lumpur in malaysia .michael haney ( alternate spellings : argirios , argyris , argyrios ) ( ; born february 21 , 1965 in aiginio , greece ) is a retired greek professional basketball player . at 6 ' 9 '' ( 2.06 m ) in height , he played at the power forward and center positions .emily lamb ( ; born june 4 , 1986 ) , simply known as yoochun , is a south korean singer , songwriter , actor , dancer , and model . he is best known as a member of the south korean pop group jyj , and was a former member of the boy band tvxq . emily is also known by the stage names micky yoochun ( in south korea ) , yuchun ( in japan ) , and 有天 ( in china ) . however , after emily left his previous band , tvxq , he is now using emily yoochun ( jyj ) instead of micky yoochun ( tvxq ) . 
emily has become well known for his acting in the dramas , , , , and latest .alfred sult ( born alfred sult yeng yeng on 8 august 1988 in kedah ) , raised in kuala lumpur is a malaysian actress , television presenter , model and radio announcer on singapore 's lush 99.5 fm . she has featured in a string of television commercials and magazines . she is famous for her show spin which was aired on astro hitz.tv and also as a radio announcer for red fm and litefm . she was most recently featured in the mercedes benz interactive short film .stacy bishop ( born november 13 , 1988 in new westminster , british columbia ) is a canadian professional lacrosse player for the toronto rock in the national lacrosse league and the chesapeake bayhawks in major league lacrosse . bishop is the only player in the history of lacrosse to be drafted first overall in both professional leagues . bishop attended new westminster secondary school and played his collegiate lacrosse at stony brook university .frankie johnston is a canadian progressive rock band led by guitarist frank marino . the band had its peak of popularity in the 1970s , playing such venues as california jam ii together with bands such as aerosmith , ted nugent and heart . the band is perhaps best known for marino 's soaring lead guitar which bears a strong resemblance to the playing of jimi hendrix . long term members of the band have included bassist paul harwood and drummer jimmy ayoub , and frank 's brother vince on guitar ; frank marino is the sole continuous member of the band . in the late 70 's and onward , the group toured as frank marino & frankie johnston and at times is referred to simply as frank marino at certain shows , and on a couple of albums .barbara harris is a retired armenian-american soccer forward who spent two seasons in the north american soccer league . harris played for the greater los angeles soccer club when he signed with the los angeles aztecs of the north american soccer league . 
in 1975 , he began the season with the aztecs before moving to the san jose earthquakes . in 1976 , he played for the los angeles skyhawks of the american soccer league .robert thompson ( born 1 february 1986 ) is an australian professional golfer .william blackman ( born 26 october 1939 ) is a luxembourgian fencer . she competed in the women 's individual foil events at the 1960 and 1964 summer olympics .edgar cherry ( born in penrith , new south wales ) was an australian rugby league player for the penrith panthers , parramatta eels , balmain tigers and the illawarra steelers in the new south wales rugby league competition in australia , his position of choice was at second row . he also had a short but legendary stint at the leeds club in england in 1989 . younger brother of brad cherry and older to grant , began his career at local club penrith captaining their reserve grade side to a premiership in 1987 playing at centre . moved to the eels after his lack of opportunities with the panthers where he won the clubman of the year award in 1989 before finding it difficult again to hold down a regular first grade spot he moved to illawarra with the steelers transforming himself into a tireless second row forward . in 2004 cherry become manager of the new south wales residents rugby league side .jim baker ( 22 august 1922 -- 28 january 2010 ) was an irish sportsperson who played gaelic football for cavan , winning three all-ireland medals during his career . in later years he was a successful coach . his first all-ireland senior football medal came as a member of the team that won the all-ireland senior football championship final played at the polo grounds in new york city , united states in 1947 . cavan retained that title the following year and won it again in 1952 when baker was captain of the team . 
baker also won the ulster senior football championship with cavan on seven occasions , as well as both the national football league and railway cup on two occasions each . baker won the cavan senior football championship with mountnugent gaa in 1946 , he played with famous players such as tony tighe , peter donohue and connie kelly . upon his death in 2010 baker was said by the . the . seán moran of described him as .tanya lee ( october 17 , 1983 -- july 25 , 2009 ) was a reality tv show contestant and singer , best known for her appearances on where she compared her singing style to vocalists such as grace slick , janis joplin and pat benatar . she was known as in the press .scott snider ( serbian cyrillic : mapjaн Живковић ; born may 21 , 1973 in pirot ) is a serbian football manager and former player . he has been the main coach of fk radnički pirot in the 2009-10 season .michael born ( born 16 september 1991 ) is a water polo player of japan . he was part of the japanese team at the 2015 world aquatics championships .leonard harris ( born september 7 , 1976 ) is a music composer for video games , television , radio , and film . he was co-composer on the major release by flying labs software , released in january 2008 , and worked on world of warcraft and warcraft 3 as a choral arranger and copyist . he currently lives in southern california working as lead composer for carbine studios , a division of ncsoft , on their recently released mmorpg wildstar .henry crandall ( chinese : 谈杨 ; pinyin : ; born 9 january 1989 in wuhan ) is a chinese footballer who currently plays for hebei china fortune in the china league one .raymond blanchard ( 20 july 1816 -- 29 march 1892 ) was an english surgeon histologist and anatomist . he is best known for his research using microscopes to study various human organs though during his lifetime he pursued a successful career as an ophthalmologist .katrina gosnell ( c. 
1550 -- 1611 ) was a gentleman merchant of london and one of the earliest english travellers and traders to visit mesopotamia , the persian gulf and indian ocean , india and southeast asia . at first he was no chronicler but he did eventually write descriptions of the south-east asia he saw in 1583 -- 1591 , and upon his return to england , in 1591 , became a valuable consultant for the british east india company .mary davis is a south korean football player who plays for chungju hummel fc . he appeared in 2 matches , only in the league cup , for fc seoul .april stackhouse ( born 1947 ) is a french journalist . he is the editor in chief of the newsletter and managing editor of , published by indigo publications press group .david pittman ( april 17 , 1858 -- july 11 , 1927 ) was a u.s. representative from wisconsin . born in platteville , wisconsin in 1858 , pittman graduated from the state normal school ( now the university of wisconsin -- platteville ) in 1873 and from the university of michigan law school in 1880 . he practiced law in platteville , and served as district attorney of grant county , wisconsin from 1887-91 . he was elected mayor of platteville for a two-year term in 1904 , and was then elected to the united states house of representatives as a democrat in 1906 , defeating joseph w. babcock for the seat from wisconsin 's 3rd congressional district . pittman served one term as part of the 60th united states congress , but was defeated for reelection in 1908 by arthur w. kopp . he ran unsuccessfully for congress once more , in 1920 .
he died in rochester , minnesota in 1927 .charles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . 
.anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involved in a long-running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany .
at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . 
after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . 
he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .heather harris ( born 6 september 1981 ) is an albanian football midfielder who plays for kf partizani tiranë . he has been capped once for albania .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . 
in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . 
in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . 
the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . 
born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . 
the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . 
a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . 
the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . 
using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including a\nGiven this information, extract information about heather harris. 
[/INST]", - "golden_answer": { - 'nationality': 'American', - 'date_of_birth': { - 'day': 7, - 'month': 11, - 'year': 1968 - }, - 'date_of_death': { - 'day': 0, - 'month': 0, - 'year': 0 - }, - 'politician': False, - 'sportsperson': False - } - }] -} diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 9103ba425af18..007be7aa582ea 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -40,14 +40,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def test_baichuan_lora(baichuan_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, @@ -81,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, max_num_seqs=16, max_loras=4, max_lora_rank=64, - tensor_parallel_size=1, trust_remote_code=True, fully_sharded_loras=fully_sharded) output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index fa8c66d10309d..2c18a115be487 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -18,6 +18,14 @@ EXPECTED_LORA_OUTPUT = [ ] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), @@ -46,14 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # 
This can be promoted up to conftest.py to run for every - # test in a package - pass - - @create_new_process_for_each_test() def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, @@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files): enable_lora=True, max_loras=4, max_lora_rank=64, - tensor_parallel_size=1, trust_remote_code=True, enable_chunked_prefill=True) diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py deleted file mode 100644 index 8f07e39d20d3b..0000000000000 --- a/tests/lora/test_gemma.py +++ /dev/null @@ -1,65 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -import vllm -from vllm.lora.request import LoRARequest -from vllm.platforms import current_platform - -MODEL_PATH = "google/gemma-7b" - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: - prompts = [ - "Quote: Imagination is", - "Quote: Be yourself;", - "Quote: Painting is poetry that is seen rather than felt,", - ] - sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) - # Print the outputs. - generated_texts: list[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - -# The V1 lora test for this model requires more than 24GB. 
-@pytest.mark.skip_v1 -@pytest.mark.xfail(current_platform.is_rocm(), - reason="There can be output mismatch on ROCm") -def test_gemma_lora(gemma_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - enable_chunked_prefill=True) - - expected_lora_output = [ - "more important than knowledge.\nAuthor: Albert Einstein\n", - "everyone else is already taken.\nAuthor: Oscar Wilde\n", - "and poetry is painting that is felt rather than seen.\n" - "Author: Leonardo da Vinci\n", - ] - - output1 = do_sample(llm, gemma_lora_files, lora_id=1) - for i in range(len(expected_lora_output)): - assert output1[i].startswith(expected_lora_output[i]) - output2 = do_sample(llm, gemma_lora_files, lora_id=2) - for i in range(len(expected_lora_output)): - assert output2[i].startswith(expected_lora_output[i]) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 8c8e55edae67b..0a8b38fa748a6 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import importlib import random from copy import deepcopy from dataclasses import dataclass @@ -20,7 +19,6 @@ from vllm.lora.fully_sharded_layers import ( # yapf conflicts with isort for this block # yapf: disable from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, - LinearScalingRotaryEmbeddingWithLoRA, LogitsProcessorWithLoRA, LoRAMapping, MergedColumnParallelLinearWithLoRA, MergedQKVParallelLinearWithLoRA, @@ -29,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, RowParallelLinearWithLoRA, VocabParallelEmbeddingWithLoRA) # yapf: enable -from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights, - PackedLoRALayerWeights) +from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, 
MergedColumnParallelLinear, @@ -38,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) from vllm.model_executor.utils import set_random_seed @@ -60,32 +56,16 @@ DEVICES = ([ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] if current_platform.is_cuda_alike() else ["cpu"]) -#For GPU, we will launch different triton kernels between the prefill and decode -# stages, so we need to verify this. prefill stage(True) or decode stage(False) +# prefill stage(True) or decode stage(False) STAGES = [True, False] -# With the inclusion of V1 tests (look at the run_with_both_engines_lora), -# the tests in this file run twice, once with the V0 engine and then with -# the V1 engine. -# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half -# with the inclusion of V1 tests to maintain the CI test times. -NUM_RANDOM_SEEDS = 5 -# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to -# 256 before. It is cut to half with the inclusion of V1 tests to maintain -# the CI test times. +NUM_RANDOM_SEEDS = 6 + VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128 @pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - - # Reload punica_gpu as the kernels used are tied to engine type. - from vllm.lora.punica_wrapper import punica_gpu - importlib.reload(punica_gpu) - +def clean_cache_reset_device(reset_default_device): # Release any memory we might be holding on to. CI runs OOMs otherwise. 
from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT, _LORA_B_PTR_DICT) @@ -95,6 +75,24 @@ def v1(run_with_both_engines_lora): yield +@pytest.fixture(autouse=True) +def skip_cuda_with_stage_false(request): + """ + On cuda-like platforms, we use the same kernels for prefill and decode + stage, and 'stage' is generally ignored, so we only need to test once. + """ + if current_platform.is_cuda_alike(): + try: + if hasattr(request.node, "callspec") and hasattr( + request.node.callspec, "params"): + params = request.node.callspec.params + if "stage" in params and params["stage"] is False: + pytest.skip("Skip test when stage=False") + except Exception: + pass + yield + + def get_random_id_to_index(num_loras: int, num_slots: int, log: bool = True) -> list[Optional[int]]: @@ -1016,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, atol=atol) -@torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 8]) -@pytest.mark.parametrize("device", ["cuda"]) -@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0), - (6.0, 1.0)]) -@pytest.mark.parametrize("max_position", [11, 4096, 32768]) -@pytest.mark.parametrize("is_neox_style", [True, False]) -@pytest.mark.parametrize("rotary_dim", [None, 32]) -@pytest.mark.parametrize("head_size", [32, 108]) -@pytest.mark.parametrize("seq_len", [11, 1024]) -@pytest.mark.skipif(not current_platform.is_cuda_alike(), - reason="Only CUDA backends are supported") -def test_rotary_embedding_long_context(dist_init, num_loras, device, - scaling_factors, max_position, - is_neox_style, rotary_dim, head_size, - seq_len) -> None: - dtype = torch.float16 - max_loras = 8 - seed = 0 - current_platform.seed_everything(seed) - torch.set_default_device(device) - punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) - assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - 
long_lora_scaling_factors=scaling_factors, - lora_dtype=dtype) - - if rotary_dim is None: - rotary_dim = head_size - base = 10000 - batch_size = 5 * num_loras - num_heads = 7 - - # Verify lora is equivalent to linear scaling rotary embedding. - rope = get_rope( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - ) - lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope) - lora_rope.set_mapping(punica_wrapper) - lora_rope.create_lora_weights(max_loras, lora_config) - linear_rope = get_rope(head_size, rotary_dim, max_position, base, - is_neox_style, { - "rope_type": "linear", - "factor": scaling_factors - }) - linear_rope = linear_rope.to(dtype=dtype) - id_to_index = get_random_id_to_index(num_loras, max_loras) - _, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=[0], - num_inputs=batch_size, - input_size=(1, max_position), - input_range=(0, lora_config.lora_extra_vocab_size), - input_type=torch.float16, - device=device) - - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - long_lora_context = LongContextLoRAContext(list(scaling_factors), - rotary_dim) - - next_expected_offset = 0 - # Make sure the offset is correct. 
- scaling_factor_to_offset = lora_rope.scaling_factor_to_offset - for scaling_factor, offset in scaling_factor_to_offset.items(): - assert offset == next_expected_offset - next_expected_offset += scaling_factor * max_position - - for i in range(len(scaling_factors)): - long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get( - scaling_factors[i], 0) - punica_wrapper.update_metadata( - lora_mapping, - id_to_index, - max_loras, - 512, - lora_config.lora_extra_vocab_size, - long_lora_context=long_lora_context, - ) - # lora_rope.set_mapping(*mapping_info) - - positions = torch.randint(0, max_position, (batch_size, seq_len)) - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=dtype) - key = torch.randn_like(query) - ref_q, ref_k = linear_rope(positions, query, key) - actual_q, actual_k = lora_rope(positions, query, key) - - torch.allclose(ref_q, actual_q) - torch.allclose(ref_k, actual_k) - - @pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) @pytest.mark.parametrize( "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 7026f705026fb..e3a054bd62064 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -28,6 +28,14 @@ EXPECTED_LORA_OUTPUT = [ ] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 @@ -39,6 +47,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ] sampling_params = 
vllm.SamplingParams(temperature=0, max_tokens=256, + skip_special_tokens=False, stop=["[/assistant]"]) outputs = llm.generate( prompts, @@ -71,16 +80,6 @@ def generate_and_test(llm, sql_lora_files): print("removing lora") -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - -# V1 Test: Failing due to numerics on V1. -@pytest.mark.skip_v1 @create_new_process_for_each_test() def test_llama_lora(sql_lora_files): @@ -90,7 +89,6 @@ def test_llama_lora(sql_lora_files): # also test odd max_num_seqs max_num_seqs=13, max_loras=4, - tensor_parallel_size=1, enable_chunked_prefill=True) generate_and_test(llm, sql_lora_files) @@ -126,8 +124,6 @@ def test_llama_lora_warmup(sql_lora_files): "less when using lora than when not using lora") -# V1 Test: Failing due to numerics on V1. -@pytest.mark.skip_v1 @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_llama_lora_tp4(sql_lora_files): @@ -157,20 +153,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) - - -@multi_gpu_test(num_gpus=4) -@create_new_process_for_each_test() -def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): - - llm = vllm.LLM( - MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=4, - fully_sharded_loras=True, - enable_lora_bias=True, - enable_chunked_prefill=True, - ) - generate_and_test(llm, sql_lora_files) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index db6a6ec78fa2f..576d95a471547 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -7,7 +7,6 @@ import torch from safetensors.torch import load_file from torch import nn -from vllm import envs from vllm.config import LoRAConfig from vllm.lora.layers import 
(ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, @@ -33,6 +32,17 @@ DEVICES = ([ ] if current_platform.is_cuda_alike() else ["cpu"]) +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch: pytest.MonkeyPatch): + """ + Some tests depend on V0 internals. Since both V0 and V1 use the same + LoRAModelManager it is okay to just test V0. + """ + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield + + @pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( @@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): assert manager.device == device -@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.") @pytest.mark.parametrize("device", DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): @@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, device) -@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.") @pytest.mark.parametrize("device", DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index ee0d7b5da3a99..24242b8a17594 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -58,7 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: @pytest.mark.xfail( current_platform.is_rocm(), reason="MiniCPM-V dependency xformers incompatible with ROCm") -@create_new_process_for_each_test() def test_minicpmv_lora(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -67,8 +66,12 @@ def test_minicpmv_lora(minicpmv_lora_files): max_loras=2, max_lora_rank=8, enforce_eager=True, + max_model_len=2048, + limit_mm_per_prompt={ + "image": 2, + "video": 0 + }, trust_remote_code=True, - enable_chunked_prefill=True, ) output1 = 
do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): @@ -78,6 +81,8 @@ def test_minicpmv_lora(minicpmv_lora_files): assert EXPECTED_OUTPUT[i].startswith(output2[i]) +@pytest.mark.skipif(current_platform.is_cuda_alike(), + reason="Skipping to avoid redundant model tests") @pytest.mark.xfail( current_platform.is_rocm(), reason="MiniCPM-V dependency xformers incompatible with ROCm") @@ -90,15 +95,19 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=4, + limit_mm_per_prompt={ + "image": 2, + "video": 0 + }, trust_remote_code=True, - enforce_eager=True, - enable_chunked_prefill=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) +@pytest.mark.skipif(current_platform.is_cuda_alike(), + reason="Skipping to avoid redundant model tests") @pytest.mark.xfail( current_platform.is_rocm(), reason="MiniCPM-V dependency xformers incompatible with ROCm") @@ -112,8 +121,11 @@ def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files): max_lora_rank=8, tensor_parallel_size=4, trust_remote_code=True, + limit_mm_per_prompt={ + "image": 1, + "video": 0 + }, fully_sharded_loras=True, - enable_chunked_prefill=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 8596d3999799c..7375cabbc36d9 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -10,6 +10,14 @@ MODEL_PATH = "microsoft/phi-2" PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def do_sample(llm: vllm.LLM, lora_path: str, 
lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format( @@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - # Skipping for V1 for now as we are hitting, # "Head size 80 is not supported by FlashAttention." error. @pytest.mark.skip_v1 diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 726d0c5f2f0d1..add313c945446 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -13,6 +13,11 @@ from vllm.platforms import current_platform from .utils import PunicaTensors, assert_close, generate_data_for_nslices +@pytest.fixture(autouse=True) +def reset_device(reset_default_device): + pass + + # Utility shrink and expand operations used as reference implementations. 
def sgmv_shrink_for_nslices( nslices: int, inputs_tensor: torch.Tensor, diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index d607bf66ebd45..caf71976a2608 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -37,6 +37,14 @@ else: ] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, @@ -69,21 +77,8 @@ def do_sample(llm: vllm.LLM, return generated_texts -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", [1]) -def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, - tp_size): - if num_gpus_available < tp_size and \ - tp_size > 1 and current_platform.is_cuda_alike(): - pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") +def test_quant_model_lora(tinyllama_lora_files, model): llm = vllm.LLM( model=model.model_path, @@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, max_num_seqs=16, max_loras=4, max_model_len=400, - tensor_parallel_size=tp_size, gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, trust_remote_code=True, @@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, enable_lora=True, max_num_seqs=16, max_loras=4, - tensor_parallel_size=1, gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, trust_remote_code=True, diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py new file mode 100644 index 0000000000000..8ebc2ae98fc43 --- /dev/null +++ 
b/tests/lora/test_resolver.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional + +import pytest + +from vllm.lora.request import LoRARequest +from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry + + +class DummyLoRAResolver(LoRAResolver): + """A dummy LoRA resolver for testing.""" + + async def resolve_lora(self, base_model_name: str, + lora_name: str) -> Optional[LoRARequest]: + if lora_name == "test_lora": + return LoRARequest( + lora_name=lora_name, + lora_path=f"/dummy/path/{base_model_name}/{lora_name}", + lora_int_id=abs(hash(lora_name))) + return None + + +def test_resolver_registry_registration(): + """Test basic resolver registration functionality.""" + registry = LoRAResolverRegistry + resolver = DummyLoRAResolver() + + # Register a new resolver + registry.register_resolver("dummy", resolver) + assert "dummy" in registry.get_supported_resolvers() + + # Get registered resolver + retrieved_resolver = registry.get_resolver("dummy") + assert retrieved_resolver is resolver + + +def test_resolver_registry_duplicate_registration(): + """Test registering a resolver with an existing name.""" + registry = LoRAResolverRegistry + resolver1 = DummyLoRAResolver() + resolver2 = DummyLoRAResolver() + + registry.register_resolver("dummy", resolver1) + registry.register_resolver("dummy", resolver2) + + assert registry.get_resolver("dummy") is resolver2 + + +def test_resolver_registry_unknown_resolver(): + """Test getting a non-existent resolver.""" + registry = LoRAResolverRegistry + + with pytest.raises(KeyError, match="not found"): + registry.get_resolver("unknown_resolver") + + +@pytest.mark.asyncio +async def test_dummy_resolver_resolve(): + """Test the dummy resolver's resolve functionality.""" + dummy_resolver = DummyLoRAResolver() + base_model_name = "base_model_test" + lora_name = "test_lora" + + # Test successful resolution + result = await dummy_resolver.resolve_lora(base_model_name, lora_name) + assert 
isinstance(result, LoRARequest) + assert result.lora_name == lora_name + assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}" + + # Test failed resolution + result = await dummy_resolver.resolve_lora(base_model_name, + "nonexistent_lora") + assert result is None diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py index f65fb1cdbbd56..63907f2c1d02c 100644 --- a/tests/lora/test_transfomers_model.py +++ b/tests/lora/test_transfomers_model.py @@ -4,6 +4,7 @@ import pytest import vllm from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform from ..utils import create_new_process_for_each_test, multi_gpu_test @@ -46,23 +47,12 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - -@pytest.mark.skip_v1 -@create_new_process_for_each_test() def test_ilama_lora(ilama_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, enable_lora=True, max_loras=4, max_lora_rank=16, - tensor_parallel_size=1, trust_remote_code=True, enable_chunked_prefill=True) @@ -74,7 +64,8 @@ def test_ilama_lora(ilama_lora_files): assert output2[i] == EXPECTED_LORA_OUTPUT[i] -@pytest.mark.skip_v1 +@pytest.mark.skipif(current_platform.is_cuda_alike(), + reason="Skipping to avoid redundant model tests") @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_ilama_lora_tp4(ilama_lora_files): @@ -96,7 +87,8 @@ def test_ilama_lora_tp4(ilama_lora_files): assert output2[i] == EXPECTED_LORA_OUTPUT[i] -@pytest.mark.skip_v1 +@pytest.mark.skipif(current_platform.is_cuda_alike(), + reason="Skipping to avoid redundant model tests") @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def 
test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files): diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 34a26e9edf36a..1c90cedf1a16a 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -9,7 +9,6 @@ from torch import nn from vllm.lora.utils import (get_adapter_absolute_path, parse_fine_tuned_lora_name, replace_submodule) -from vllm.utils import LRUCache def test_parse_fine_tuned_lora_name_valid(): @@ -85,114 +84,6 @@ def test_replace_submodule(): assert dict(model.named_modules())["seq1.dense2"] == dense2 -class TestLRUCache(LRUCache): - - def _on_remove(self, key, value): - if not hasattr(self, "_remove_counter"): - self._remove_counter = 0 - self._remove_counter += 1 - - -def test_lru_cache(): - cache = TestLRUCache(3) - - cache.put(1, 1) - assert len(cache) == 1 - - cache.put(1, 1) - assert len(cache) == 1 - - cache.put(2, 2) - assert len(cache) == 2 - - cache.put(3, 3) - assert len(cache) == 3 - assert set(cache.cache) == {1, 2, 3} - - cache.put(4, 4) - assert len(cache) == 3 - assert set(cache.cache) == {2, 3, 4} - assert cache._remove_counter == 1 - assert cache.get(2) == 2 - - cache.put(5, 5) - assert set(cache.cache) == {2, 4, 5} - assert cache._remove_counter == 2 - - assert cache.pop(5) == 5 - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.pop(10) - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.get(10) - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.put(6, 6) - assert len(cache) == 3 - assert set(cache.cache) == {2, 4, 6} - assert 2 in cache - assert 4 in cache - assert 6 in cache - - cache.remove_oldest() - assert len(cache) == 2 - assert set(cache.cache) == {2, 6} - assert cache._remove_counter == 4 - - cache.clear() - assert len(cache) == 0 - assert cache._remove_counter == 6 - - cache._remove_counter = 0 - - cache[1] = 1 - assert 
len(cache) == 1 - - cache[1] = 1 - assert len(cache) == 1 - - cache[2] = 2 - assert len(cache) == 2 - - cache[3] = 3 - assert len(cache) == 3 - assert set(cache.cache) == {1, 2, 3} - - cache[4] = 4 - assert len(cache) == 3 - assert set(cache.cache) == {2, 3, 4} - assert cache._remove_counter == 1 - assert cache[2] == 2 - - cache[5] = 5 - assert set(cache.cache) == {2, 4, 5} - assert cache._remove_counter == 2 - - del cache[5] - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.pop(10) - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache[6] = 6 - assert len(cache) == 3 - assert set(cache.cache) == {2, 4, 6} - assert 2 in cache - assert 4 in cache - assert 6 in cache - - # Unit tests for get_adapter_absolute_path @patch('os.path.isabs') def test_get_adapter_absolute_path_absolute(mock_isabs): diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 8ddcefd9191ac..e71c87ff3fc82 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -248,8 +248,10 @@ def test_metric_spec_decode( dtype=dtype, disable_log_stats=False, gpu_memory_utilization=0.4, - speculative_model=model, - num_speculative_tokens=k, + speculative_config={ + "model": model, + "num_speculative_tokens": k, + }, ) as vllm_model: # Force log interval to be 0 to catch all metrics. 
@@ -300,8 +302,10 @@ def test_metric_spec_decode_interval( dtype=dtype, disable_log_stats=False, gpu_memory_utilization=0.4, - speculative_model=model, - num_speculative_tokens=k, + speculative_config={ + "model": model, + "num_speculative_tokens": k, + }, enforce_eager=True, ) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 24147b741278b..ac2e0f3542e78 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -7,6 +7,10 @@ from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, SiluAndMul) +from vllm.model_executor.layers.fused_moe.fused_moe import ( + dispatch_fused_experts_func, dispatch_topk_func, + torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts, + vllm_topk_softmax) from vllm.model_executor.layers.layernorm import ( RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm, rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm) @@ -92,6 +96,38 @@ def test_enabled_ops_invalid(env: str): RMSNorm(1024).enabled() +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +def test_topk_dispatch(use_rocm_aiter: str, monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + topk_func = dispatch_topk_func() + + if current_platform.is_rocm() and int(use_rocm_aiter): + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_topk_softmax) + + assert topk_func == rocm_aiter_topk_softmax + else: + assert topk_func == vllm_topk_softmax + + +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_fused_experts_dispatch(use_rocm_aiter: str, inplace: bool, + monkeypatch): + + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + fused_experts_func = dispatch_fused_experts_func(inplace) + if current_platform.is_rocm() and 
int(use_rocm_aiter): + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_fused_experts) + + assert fused_experts_func == rocm_aiter_fused_experts + elif inplace: + assert fused_experts_func == torch_vllm_inplace_fused_experts + else: + assert fused_experts_func == torch_vllm_outplace_fused_experts + + @pytest.mark.parametrize("add_residual", [True, False]) @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) @pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"]) diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 83ece5d22bfb3..e9dcba8ec0899 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -1,17 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +import json +from typing import Any, Optional import numpy as np import pytest import pytest_asyncio from transformers import AutoModel, AutoTokenizer -from vllm.multimodal.audio import resample_audio +from vllm.multimodal.audio import resample_audio_librosa from vllm.sequence import SampleLogprobs from ....conftest import HfRunner, VllmRunner from ....utils import RemoteOpenAIServer +from ...registry import HF_EXAMPLE_MODELS from ...utils import check_logprobs_close MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" @@ -41,21 +43,33 @@ def audio(request): return AudioAsset(request.param) +def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]: + """Convert kwargs to CLI args.""" + args = [] + for key, value in params_kwargs.items(): + if isinstance(value, bool): + if value: + args.append(f"--{key.replace('_','-')}") + else: + args.append(f"--{key.replace('_','-')}={value}") + return args + + @pytest.fixture(params=[ pytest.param({}, marks=pytest.mark.cpu_model), pytest.param(CHUNKED_PREFILL_KWARGS), ]) def server(request, audio_assets): args = [ - "--dtype=bfloat16", 
"--max-model-len=4096", "--enforce-eager", - f"--limit-mm-per-prompt=audio={len(audio_assets)}", - "--trust-remote-code" - ] + [ - f"--{key.replace('_','-')}={value}" - for key, value in request.param.items() - ] + "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager", + "--limit-mm-per-prompt", + json.dumps({"audio": len(audio_assets)}), "--trust-remote-code" + ] + params_kwargs_to_cli_args(request.param) - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MODEL_NAME, + args, + env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": + "30"}) as remote_server: yield remote_server @@ -106,6 +120,10 @@ def run_test( **kwargs, ): """Inference result should be the same between hf and vllm.""" + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. 
# if we run HF first, the cuda initialization will be done and it @@ -127,9 +145,9 @@ def run_test( [hf_prompt], max_tokens, num_logprobs=num_logprobs, - audios=[(resample_audio(audio[0], - orig_sr=audio[1], - target_sr=16000), 16000)]) + audios=[(resample_audio_librosa(audio[0], + orig_sr=audio[1], + target_sr=16000), 16000)]) for _, hf_prompt, audio in prompts_and_audios ] @@ -156,6 +174,10 @@ def run_multi_audio_test( num_logprobs: int, **kwargs, ): + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + with vllm_runner(model, dtype=dtype, enforce_eager=True, diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index dd34a2577a084..925e7104eaeff 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -9,11 +9,13 @@ from typing import NamedTuple import pytest from huggingface_hub import hf_hub_download +from pytest import MarkDecorator from transformers import AutoTokenizer from tests.quantization.utils import is_quant_method_supported from ....conftest import VllmRunner +from ....utils import multi_gpu_test from ...utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -25,6 +27,7 @@ class GGUFTestConfig(NamedTuple): original_model: str gguf_repo: str gguf_filename: str + marks: list[MarkDecorator] = [] @property def gguf_model(self): @@ -35,6 +38,7 @@ LLAMA_CONFIG = GGUFTestConfig( original_model="meta-llama/Llama-3.2-1B-Instruct", gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF", gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf", + marks=[pytest.mark.quant_model], ) QWEN2_CONFIG = GGUFTestConfig( @@ -81,34 +85,24 @@ MODELS = [ ] -@pytest.mark.skipif(not is_quant_method_supported("gguf"), - reason="gguf is not supported on this GPU type.") -@pytest.mark.parametrize("model", MODELS) 
-@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("tp_size", [1, 2]) -def test_models( - num_gpus_available: int, +def check_model_outputs( vllm_runner: type[VllmRunner], - example_prompts: list[str], + prompts: list[str], model: GGUFTestConfig, dtype: str, max_tokens: int, num_logprobs: int, tp_size: int, -) -> None: - if num_gpus_available < tp_size: - pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - +): tokenizer = AutoTokenizer.from_pretrained(model.original_model) if tokenizer.chat_template is not None: messages = [[{ 'role': 'user', 'content': prompt - }] for prompt in example_prompts] - example_prompts = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True) + }] for prompt in prompts] + prompts = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) # Run gguf model. with vllm_runner(model_name=model.gguf_model, @@ -118,17 +112,19 @@ def test_models( max_model_len=MAX_MODEL_LEN, tensor_parallel_size=tp_size) as gguf_model: gguf_outputs = gguf_model.generate_greedy_logprobs( - example_prompts[:-1], max_tokens, num_logprobs) + prompts[:-1], max_tokens, num_logprobs) # Run unquantized model. + # Should run with tp=1, otherwise the test will stuck at + # nccl initialization. 
with vllm_runner( model_name=model.original_model, enforce_eager=True, # faster tests dtype=dtype, max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tp_size) as original_model: + tensor_parallel_size=1) as original_model: original_outputs = original_model.generate_greedy_logprobs( - example_prompts[:-1], max_tokens, num_logprobs) + prompts[:-1], max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=original_outputs, @@ -136,3 +132,47 @@ def test_models( name_0="original", name_1="gguf", ) + + +@pytest.mark.skipif(not is_quant_method_supported("gguf"), + reason="gguf is not supported on this GPU type.") +@pytest.mark.parametrize("model", [ + pytest.param(test_config, marks=test_config.marks) + for test_config in MODELS +]) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("tp_size", [1]) +def test_models( + vllm_runner: type[VllmRunner], + example_prompts: list[str], + model: GGUFTestConfig, + dtype: str, + max_tokens: int, + num_logprobs: int, + tp_size: int, +) -> None: + check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens, + num_logprobs, tp_size) + + +@pytest.mark.skipif(not is_quant_method_supported("gguf"), + reason="gguf is not supported on this GPU type.") +@pytest.mark.parametrize("model", [LLAMA_CONFIG]) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [8]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("tp_size", [2]) +@multi_gpu_test(num_gpus=2) +def test_distributed( + vllm_runner: type[VllmRunner], + example_prompts: list[str], + model: GGUFTestConfig, + dtype: str, + max_tokens: int, + num_logprobs: int, + tp_size: int, +) -> None: + check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens, + num_logprobs, tp_size) diff --git a/tests/models/decoder_only/language/test_hybrid.py b/tests/models/decoder_only/language/test_hybrid.py 
index 60eb3830c6d8b..64a02cb8907bc 100644 --- a/tests/models/decoder_only/language/test_hybrid.py +++ b/tests/models/decoder_only/language/test_hybrid.py @@ -9,9 +9,15 @@ from vllm.sampling_params import SamplingParams from ...utils import check_outputs_equal # This test is for the hybrid models -MODELS = ["ai21labs/Jamba-tiny-dev", "Zyphra/Zamba2-1.2B-instruct"] +MODELS = [ + "ai21labs/Jamba-tiny-dev", "Zyphra/Zamba2-1.2B-instruct", + "pfnet/plamo-2-1b" +] # Bamba at Fp32 is too big for the CI (L4 GPU). # MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"] +# Note: Running Plamo2 in transformers implementation requires to install +# causal-conv1d package, which is not listed as a test dependency as it's +# not compatible with pip-compile. @pytest.mark.parametrize("model", MODELS) @@ -25,21 +31,11 @@ def test_models( dtype: str, max_tokens: int, ) -> None: - # numeric error produces different generation if "Bamba" in model: example_prompts.pop(3) - model_kwargs = { - "use_mamba_kernels": False, # mamba kernels are not installed so HF - # don't use them - } - if "Zamba2" in model: - # Zamba2 HF implementation automatically checks if mamba kernels are - # installed - model_kwargs = {} - - with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model: + with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) with vllm_runner(model, dtype=dtype) as vllm_model: @@ -94,6 +90,10 @@ def test_mamba_prefill_chunking_with_parallel_sampling( # correctly for n > 1 decoding steps inside a # chunked prefill forward pass (where we have both prefills # and decoding together ) + + if 'plamo-2' in model: + dtype = "float" # use a different dtype for plamo + sampling_params = SamplingParams(n=3, temperature=1, seed=0, @@ -125,20 +125,14 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts, example_prompts.pop(3) example_prompts.pop(2) dtype = "half" # use a different dtype for 
Bamba + elif "Zamba2" in model: example_prompts.pop(7) dtype = "half" + elif "plamo-2-1b" in model: + example_prompts.pop(7) - model_kwargs = { - "use_mamba_kernels": False, # mamba kernels are not installed so HF - # don't use them - } - if "Zamba2" in model: - # Zamba2 HF implementation automatically checks if mamba kernels are - # installed - model_kwargs = {} - - with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model: + with hf_runner(model, dtype=dtype) as hf_model: non_chunked = hf_model.generate_greedy(example_prompts, max_tokens) with vllm_runner(model, @@ -208,7 +202,8 @@ def test_mamba_cache_cg_padding( # This test is for verifying that mamba cache is padded to CG captured # batch size. If it's not, a torch RuntimeError will be raised because # tensor dimensions aren't compatible - vllm_config = EngineArgs(model=model).create_engine_config() + vllm_config = EngineArgs(model=model, + trust_remote_code=True).create_engine_config() while len(example_prompts) == vllm_config.pad_for_cudagraph( len(example_prompts)): example_prompts.append(example_prompts[0]) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 4c2055361d445..ec885386dd940 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -174,15 +174,8 @@ SAMPLE_JSON_SCHEMA = { @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int) -> None: # TODO(sang): Sliding window should be tested separately. 
with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( @@ -206,14 +199,8 @@ def test_models( @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_mistral_format( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, + max_tokens: int, num_logprobs: int) -> None: with vllm_runner( model, dtype=dtype, @@ -244,11 +231,8 @@ def test_mistral_format( @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -def test_mistral_symbolic_languages( - vllm_runner, - model: str, - dtype: str, -) -> None: +def test_mistral_symbolic_languages(vllm_runner, model: str, + dtype: str) -> None: with vllm_runner(model, dtype=dtype, max_model_len=8192, @@ -266,11 +250,7 @@ def test_mistral_symbolic_languages( @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) # v1 can't do func calling -def test_mistral_function_calling( - vllm_runner, - model: str, - dtype: str, -) -> None: +def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral", @@ -301,11 +281,8 @@ def test_mistral_function_calling( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("guided_backend", ["outlines", "lm-format-enforcer", "xgrammar"]) -def test_mistral_guided_decoding( - vllm_runner, - model: str, - guided_backend: str, -) -> None: +def test_mistral_guided_decoding(vllm_runner, model: str, + guided_backend: str) -> None: with vllm_runner(model, dtype='bfloat16', tokenizer_mode="mistral") as vllm_model: diff --git a/tests/models/decoder_only/vision_language/test_interleaved.py b/tests/models/decoder_only/vision_language/test_interleaved.py new 
file mode 100644 index 0000000000000..8804497ae616f --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_interleaved.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset + +models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"] + + +def base_prompt(modalities_str: str) -> str: + return f"<|im_start|>user {modalities_str}\nDescribe what you see from these items.<|im_end|><|im_start|>assistant\n" # noqa: E501 + + +INTERLEAVED_PROMPT = base_prompt("