diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
deleted file mode 100644
index 1bce7e7fdf146..0000000000000
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
-model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.595
-  - name: "exact_match,flexible-extract"
-    value: 0.582
-limit: 1000
-num_fewshot: 5
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 33b7114666fa2..12f730738b8a5 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -116,24 +116,6 @@ steps:
     commands:
       - "bash .buildkite/scripts/annotate-release.sh"
 
-  - label: "Build and publish TPU release image"
-    depends_on: ~
-    if: build.env("NIGHTLY") == "1"
-    agents:
-      queue: tpu_queue_postmerge
-    commands:
-      - "yes | docker system prune -a"
-      - "git fetch --all"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
-      - "docker push vllm/vllm-tpu:nightly"
-      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllmbot
-          password-env: DOCKERHUB_TOKEN
-    env:
-      DOCKER_BUILDKIT: "1"
-
   - input: "Provide Release version here"
     id: input-release-version
     fields:
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index fde48603ad3cd..56bb5cedaa0a9 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -2,16 +2,23 @@
 
 set -ex
 
-# Get release version and strip leading 'v' if present
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
-
-if [ -z "$RELEASE_VERSION" ]; then
-  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
-  exit 1
+# Get release version, default to 1.0.0.dev for nightly/per-commit builds
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
+if [ -z "${RELEASE_VERSION}" ]; then
+  RELEASE_VERSION="1.0.0.dev"
 fi
 
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
-To download the wheel:
+To download the wheel (by commit):
+\`\`\`
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+\`\`\`
+
+To download the wheel (by version):
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index aa4cc7b35a543..58fd435691f4a 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -173,6 +173,14 @@ fi
 PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."
 
+# Test that we're launching on the machine that has
+# proper access to GPUs
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+  echo "Error: 'render' group not found. This is required for GPU access." >&2
+  exit 1
+fi
+
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
   # assign job count as the number of shards used   
@@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then
         --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
         --network=host \
         --shm-size=16gb \
+        --group-add "$render_gid" \
         --rm \
         -e HIP_VISIBLE_DEVICES="${GPU}" \
         -e HF_TOKEN \
@@ -217,8 +226,8 @@ else
           --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
           --network=host \
           --shm-size=16gb \
+          --group-add "$render_gid" \
           --rm \
-          -e HIP_VISIBLE_DEVICES=0 \
           -e HF_TOKEN \
           -e AWS_ACCESS_KEY_ID \
           -e AWS_SECRET_ACCESS_KEY \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index c023457fb03e4..bb5ef5d624630 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -48,8 +48,8 @@ steps:
   commands:
   - bash standalone_tests/pytorch_nightly_dependency.sh
 
-- label: Async Engine, Inputs, Utils, Worker Test # 36min
-  timeout_in_minutes: 50
+- label: Async Engine, Inputs, Utils, Worker Test # 10min
+  timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
@@ -344,7 +344,7 @@ steps:
     - pytest -v -s v1/logits_processors
     - pytest -v -s v1/worker
     - pytest -v -s v1/spec_decode
-    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
     - pytest -v -s -m 'not cpu_test' v1/metrics
     - pytest -v -s v1/test_oracle.py
     - pytest -v -s v1/test_request.py
@@ -616,9 +616,9 @@ steps:
   - uv pip install --system torchao==0.13.0
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
-- label: LM Eval Small Models # 53min
-  timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+- label: LM Eval Small Models # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -627,17 +627,18 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 
-- label: OpenAI API correctness # 22min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+- label: OpenAI API correctness # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
   - csrc/
   - vllm/entrypoints/openai/
   - vllm/model_executor/models/whisper.py
-  commands: # LMEval+Transcription WER check
-  - pytest -s entrypoints/openai/correctness/
+  commands: # LMEval
+  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
+  - pytest -s entrypoints/openai/correctness/  --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
 
 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
@@ -858,10 +859,10 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 
-- label: Multi-Modal Accuracy Eval (Small Models) # 50min
-  mirror_hardwares: [amdexperimental]
+- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
-  timeout_in_minutes: 70
+  timeout_in_minutes: 15
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - vllm/multimodal/
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a020b0d276be0..a0d2076199b14 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -232,8 +232,8 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_algo.py
 
-- label: EPLB Execution Test # 5min
-  timeout_in_minutes: 15
+- label: EPLB Execution Test # 10min
+  timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -241,6 +241,7 @@ steps:
   - tests/distributed/test_eplb_execute.py
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
 
 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
@@ -315,6 +316,7 @@ steps:
     - vllm/
     - tests/v1
   commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
     # split the test to avoid interference
     - pytest -v -s -m 'not cpu_test' v1/core
     - pytest -v -s v1/executor
@@ -347,8 +349,7 @@ steps:
     - vllm/v1/attention
     - tests/v1/attention
   commands:
-    - export VLLM_DISABLE_FLASHINFER_PREFILL=1 # TODO: FI prefill is bugged and causes incorrectness, fix this
-    - pytest -v -s v1/attention
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
 
 - label: V1 Test others (CPU) # 5 mins
   source_file_dependencies:
@@ -459,18 +460,21 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/test_multimodal_compile.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 22min
-  timeout_in_minutes: 35
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_full_graph.py
-  - pytest -v -s compile/test_fusions_e2e.py
+  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
 
 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -544,8 +548,11 @@ steps:
 
 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
+  torch_nightly: true
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
   - vllm/model_executor
   - tests/model_executor
   - tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -924,7 +931,33 @@ steps:
     - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
     # this runner has 2 GPUs available even though num_gpus=2 is not set
     - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml 
+    - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusions_e2e.py
+  - tests/compile/test_full_graph.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
     - pytest -v -s tests/compile/test_fusions_e2e.py
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
 
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
@@ -1223,6 +1256,7 @@ steps:
     - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
     - pytest -v -s tests/distributed/test_context_parallel.py
     - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
@@ -1233,6 +1267,7 @@ steps:
   commands:
     - pytest -v -s tests/distributed/test_context_parallel.py
     - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ba08a43352154..23def076cf880 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -9,7 +9,7 @@
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
-/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@@ -105,11 +105,21 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/triton_unified_attention.py @tdoublep
 
 # ROCm related: specify owner with write access to notify AMD folks for careful code review
-/docker/Dockerfile.rocm* @gshtras
-/vllm/v1/attention/backends/rocm*.py @gshtras
-/vllm/v1/attention/backends/mla/rocm*.py @gshtras
-/vllm/attention/ops/rocm*.py @gshtras
-/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+/vllm/**/*rocm* @tjtanaa
+/docker/Dockerfile.rocm* @gshtras @tjtanaa
+/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
+/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
+/csrc/rocm @gshtras @tjtanaa
+/requirements/*rocm* @tjtanaa
+/tests/**/*rocm* @tjtanaa
+/docs/**/*rocm* @tjtanaa
+/vllm/**/*quark* @tjtanaa
+/tests/**/*quark* @tjtanaa
+/docs/**/*quark* @tjtanaa
+/vllm/**/*aiter* @tjtanaa
+/tests/**/*aiter* @tjtanaa
 
 # TPU
 /vllm/v1/worker/tpu* @NickLucche
@@ -127,3 +137,8 @@ mkdocs.yaml @hmellor
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler.py @noooop
+
+# Security guide and policies
+/docs/usage/security.md @russellb
+/SECURITY.md @russellb
+/docs/contributing/vulnerability_management.md @russellb
diff --git a/.gitignore b/.gitignore
index ffa36dee1ab9d..50070d7898fe6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -221,3 +221,6 @@ csrc/moe/marlin_moe_wna16/kernel_*
 
 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
+
+# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
+!vllm/benchmarks/lib/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bcd40e7f8ab39..e034f75a9d322 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -38,7 +38,7 @@ repos:
   rev: 0.9.1
   hooks:
     - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
       files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7cb94f919f123..0e9fa63b178ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -241,7 +241,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   message(STATUS "Enabling cumem allocator extension.")
   # link against cuda driver library
   list(APPEND CUMEM_LIBS CUDA::cuda_driver)
-  define_gpu_extension_target(
+  define_extension_target(
     cumem_allocator
     DESTINATION vllm
     LANGUAGE CXX
@@ -858,7 +858,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 
 message(STATUS "Enabling C extension.")
-define_gpu_extension_target(
+define_extension_target(
   _C
   DESTINATION vllm
   LANGUAGE ${VLLM_GPU_LANG}
@@ -973,7 +973,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 
 message(STATUS "Enabling moe extension.")
-define_gpu_extension_target(
+define_extension_target(
   _moe_C
   DESTINATION vllm
   LANGUAGE ${VLLM_GPU_LANG}
@@ -994,7 +994,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
     "csrc/rocm/skinny_gemms.cu"
     "csrc/rocm/attention.cu")
 
-  define_gpu_extension_target(
+  define_extension_target(
     _rocm_C
     DESTINATION vllm
     LANGUAGE ${VLLM_GPU_LANG}
diff --git a/README.md b/README.md
index 2e750ef8fc894..b5e230e4b9b07 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
@@ -83,7 +84,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support
 
diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py
new file mode 100644
index 0000000000000..38e7fdcf55426
--- /dev/null
+++ b/benchmarks/kernels/benchmark_fused_collective.py
@@ -0,0 +1,1129 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark for FlashInfer fused collective operations vs standard operations.
+
+This benchmark compares:
+1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant)
+2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
+
+Usage with torchrun:
+    torchrun --nproc_per_node=2 benchmark_fused_collective.py
+
+"""
+
+import argparse
+import itertools
+import os
+import time
+
+import pandas as pd
+import torch  # type: ignore
+import torch.distributed as dist  # type: ignore
+
+from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config
+from vllm.distributed import (
+    get_tp_group,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.distributed.parallel_state import (
+    graph_capture,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.layernorm import RMSNorm  # noqa
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8  # noqa
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape  # noqa
+from vllm.platforms import current_platform  # noqa
+
+RMS_NORM_OP = torch.ops._C.rms_norm
+FUSED_ADD_RMS_NORM_OP = torch.ops._C.fused_add_rms_norm
+RMS_NORM_STATIC_FP8_QUANT_OP = torch.ops._C.rms_norm_static_fp8_quant
+FUSED_ADD_RMS_NORM_STATIC_FP8_QUANT_OP = (
+    torch.ops._C.fused_add_rms_norm_static_fp8_quant
+)
+SCALED_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant
+
+logger = init_logger(__name__)
+
+# Try to import FlashInfer
+try:
+    import flashinfer.comm as flashinfer_comm  # type: ignore
+
+    if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"):
+        flashinfer_comm = None
+        logger.warning(
+            "FlashInfer comm module found but missing trtllm_allreduce_fusion"
+        )
+except ImportError:
+    flashinfer_comm = None
+    logger.warning("FlashInfer not found, only benchmarking standard operations")
+
+# Constants
+FP8_DTYPE = current_platform.fp8_dtype()
+MiB = 1024 * 1024
+
+# FlashInfer max sizes per world size
+# Enable 64MB for 2, 4, 8 world sizes to verify large input sizes
+# use --disable-oneshot to disable oneshot mode for very large input sizes
+_FI_MAX_SIZES = {
+    2: 64 * MiB,  # 64MB
+    4: 64 * MiB,  # 64MB
+    8: 64 * MiB,  # 64MB
+}
+
+# Global workspace tensor for FlashInfer
+_FI_WORKSPACE_TENSOR = None
+
+
+def setup_flashinfer_workspace(
+    world_size: int,
+    rank: int,
+    hidden_dim: int,
+    max_token_num: int,
+    use_fp32_lamport: bool = False,
+):
+    """Setup FlashInfer workspace for fused allreduce operations."""
+    global _FI_WORKSPACE_TENSOR
+
+    if flashinfer_comm is None:
+        return None, None
+
+    if world_size not in _FI_MAX_SIZES:
+        logger.warning("FlashInfer not supported for world size %s", world_size)
+        return None, None
+
+    try:
+        # Create IPC workspace
+        ipc_handles, workspace_tensor = (
+            flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
+                tp_rank=rank,
+                tp_size=world_size,
+                max_token_num=max_token_num,
+                hidden_dim=hidden_dim,
+                group=get_tp_group().device_group,
+                use_fp32_lamport=use_fp32_lamport,
+            )
+        )
+
+        _FI_WORKSPACE_TENSOR = workspace_tensor
+        return ipc_handles, workspace_tensor
+    except Exception as e:
+        logger.error("Failed to setup FlashInfer workspace: %s", e)
+        return None, None
+
+
+def cleanup_flashinfer_workspace(ipc_handles):
+    """Cleanup FlashInfer workspace."""
+    if flashinfer_comm is None or ipc_handles is None:
+        return
+
+    try:
+        group = get_tp_group().device_group
+        flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group)
+    except Exception as e:
+        logger.error("Failed to cleanup FlashInfer workspace: %s", e)
+
+
+class FlashInferFusedAllReduceParams:
+    """Parameters for FlashInfer fused allreduce operations."""
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        use_fp32_lamport: bool = False,
+        max_token_num: int = 1024,
+    ):
+        self.rank = rank
+        self.world_size = world_size
+        self.use_fp32_lamport = use_fp32_lamport
+        self.trigger_completion_at_end = True
+        self.launch_with_pdl = True
+        self.fp32_acc = True
+        self.max_token_num = max_token_num
+
+    def get_trtllm_fused_allreduce_kwargs(self):
+        return {
+            "world_rank": self.rank,
+            "world_size": self.world_size,
+            "launch_with_pdl": self.launch_with_pdl,
+            "trigger_completion_at_end": self.trigger_completion_at_end,
+            "fp32_acc": self.fp32_acc,
+        }
+
+
+def flashinfer_fused_allreduce_rmsnorm(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor | None,
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    allreduce_params: "FlashInferFusedAllReduceParams",
+    use_oneshot: bool,
+    norm_out: torch.Tensor | None = None,
+):
+    """FlashInfer fused allreduce + rmsnorm operation."""
+    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+        raise RuntimeError("FlashInfer not available or workspace not initialized")
+
+    if norm_out is None:
+        norm_out = input_tensor
+        residual_out = residual
+    else:
+        residual_out = input_tensor
+
+    flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        token_num=input_tensor.shape[0],
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        rms_gamma=rms_gamma,
+        rms_eps=rms_eps,
+        hidden_dim=input_tensor.shape[-1],
+        workspace_ptrs=_FI_WORKSPACE_TENSOR,
+        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
+        allreduce_out=None,
+        quant_out=None,
+        scale_out=None,
+        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
+        scale_factor=None,
+        use_oneshot=use_oneshot,
+        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+    )
+
+
+def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor | None,
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    scale_factor: torch.Tensor,
+    allreduce_params: FlashInferFusedAllReduceParams,
+    use_oneshot: bool = True,
+    norm_out: torch.Tensor | None = None,
+    quant_out: torch.Tensor | None = None,
+):
+    """FlashInfer fused allreduce + rmsnorm + FP8 quantization."""
+    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+        raise RuntimeError("FlashInfer not available or workspace not initialized")
+
+    if norm_out is None:
+        norm_out = input_tensor
+        residual_out = residual
+    else:
+        residual_out = input_tensor
+
+    flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        token_num=input_tensor.shape[0],
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        rms_gamma=rms_gamma,
+        rms_eps=rms_eps,
+        hidden_dim=input_tensor.shape[-1],
+        workspace_ptrs=_FI_WORKSPACE_TENSOR,
+        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
+        allreduce_out=None,
+        quant_out=quant_out,
+        scale_out=None,
+        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
+        scale_factor=scale_factor,
+        use_oneshot=use_oneshot,
+        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+    )
+
+
+def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor | None,
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    input_global_scale: torch.Tensor,
+    allreduce_params: FlashInferFusedAllReduceParams,
+    quant_out: torch.Tensor,
+    use_oneshot: bool,
+    output_scale: torch.Tensor,
+    norm_out: torch.Tensor | None = None,
+):
+    """FlashInfer fused allreduce + rmsnorm + FP4 quantization."""
+    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+        raise RuntimeError("FlashInfer not available or workspace not initialized")
+
+    if norm_out is None:
+        norm_out = input_tensor
+        residual_out = residual
+    else:
+        residual_out = input_tensor
+
+    flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        token_num=input_tensor.shape[0],
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        rms_gamma=rms_gamma,
+        rms_eps=rms_eps,
+        hidden_dim=input_tensor.shape[-1],
+        workspace_ptrs=_FI_WORKSPACE_TENSOR,
+        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
+        allreduce_out=None,
+        quant_out=quant_out,
+        scale_out=output_scale,
+        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
+        scale_factor=input_global_scale,
+        use_oneshot=use_oneshot,
+        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+    )
+
+
+class VllmFusedAllreduce:
+    def __init__(self, hidden_dim, dtype):
+        self.rms_eps = 1e-6
+        self.rms_norm = RMSNorm(hidden_dim, eps=self.rms_eps, dtype=dtype)
+        self.fp8_quant = QuantFP8(
+            static=True,
+            group_shape=GroupShape.PER_TENSOR,
+        )
+
+    def allreduce_rmsnorm(
+        self, input_tensor: torch.Tensor, residual: torch.Tensor | None
+    ):
+        allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+        return self.rms_norm(allreduce_out, residual)
+
+    def allreduce_rmsnorm_fp8_quant(
+        self,
+        input_tensor: torch.Tensor,
+        residual: torch.Tensor | None,
+        scale_factor: torch.Tensor,
+    ):
+        allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+        rms_out = self.rms_norm(allreduce_out, residual)
+        if residual is None:
+            quant_out = self.fp8_quant(rms_out, scale_factor)
+            return quant_out
+        else:
+            rms_out, residual_out = rms_out
+            quant_out = self.fp8_quant(rms_out, scale_factor)
+            return quant_out, residual_out
+
+    def allreduce_rmsnorm_fp4_quant(
+        self,
+        input_tensor: torch.Tensor,
+        residual: torch.Tensor | None,
+        input_global_scale: torch.Tensor,
+        quant_out: torch.Tensor,
+        output_scale: torch.Tensor,
+    ):
+        allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+        rms_out = self.rms_norm(allreduce_out, residual)
+        if residual is None:
+            SCALED_FP4_QUANT_OP(quant_out, rms_out, output_scale, input_global_scale)
+            return quant_out, output_scale
+        else:
+            rms_out, residual_out = rms_out
+            SCALED_FP4_QUANT_OP(quant_out, rms_out, output_scale, input_global_scale)
+            return quant_out, residual_out, output_scale
+
+
+def create_test_tensors(
+    num_tokens: int, hidden_dim: int, dtype: torch.dtype, use_residual: bool = True
+):
+    """Create test tensors for benchmarking."""
+    input_tensor = torch.randn(num_tokens, hidden_dim, dtype=dtype)
+    residual = (
+        torch.randn_like(input_tensor)
+        if use_residual
+        else torch.zeros_like(input_tensor)
+    )
+    rms_gamma = torch.ones(hidden_dim, dtype=dtype)
+    norm_out = None if use_residual else torch.empty_like(input_tensor)
+
+    # Quantization scales
+    scale_fp8 = torch.tensor(1.0, dtype=torch.float32)
+    scale_fp4 = torch.tensor(1.0, dtype=torch.float32)
+    quant_out_fp8 = torch.empty_like(input_tensor, dtype=FP8_DTYPE)
+    # Pre-allocate FP4 output tensors (to avoid allocation overhead in benchmarks)
+    fp4_quant_out = torch.empty((num_tokens, hidden_dim // 2), dtype=torch.uint8)
+    fp4_output_scale = torch.empty((128, 4), dtype=torch.int32)
+
+    return (
+        input_tensor,
+        norm_out,
+        residual,
+        rms_gamma,
+        scale_fp8,
+        quant_out_fp8,
+        scale_fp4,
+        fp4_quant_out,
+        fp4_output_scale,
+    )
+
+
+def benchmark_operation(
+    operation_func, *args, warmup: int = 5, trials: int = 20, **kwargs
+):
+    """Benchmark a single operation using CUDA graphs."""
+    # Warmup before graph capture
+    for _ in range(warmup):
+        operation_func(*args, **kwargs)
+    torch.cuda.synchronize()
+
+    # Create CUDA graph
+    graph = torch.cuda.CUDAGraph()
+    num_op_per_cudagraph = 10
+
+    # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
+    device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    with graph_capture(device=device), torch.cuda.graph(graph):
+        for _ in range(num_op_per_cudagraph):
+            operation_func(*args, **kwargs)
+
+    # Graph warmup
+    torch.cuda.synchronize()
+    for _ in range(warmup):
+        graph.replay()
+
+    # Benchmark with CUDA graph
+    torch.cuda.synchronize()
+    start_time = time.perf_counter()
+
+    for _ in range(trials // num_op_per_cudagraph):
+        # operation_func(*args, **kwargs)
+        graph.replay()
+
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+
+    avg_time_ms = ((end_time - start_time) / trials) * 1000
+    return avg_time_ms
+
+
+def run_benchmarks(
+    num_tokens: int,
+    hidden_dim: int,
+    dtype: torch.dtype,
+    use_residual: bool,
+    allreduce_params: FlashInferFusedAllReduceParams | None,
+    quant_modes: set[str],
+    no_oneshot: bool,
+):
+    """Run all benchmarks for given configuration.
+
+    Args:
+        quant_mode: "none", "fp8_only", "fp4_only", or "all"
+    """
+    (
+        input_tensor,
+        norm_out,
+        residual,
+        rms_gamma,
+        scale_fp8,
+        quant_out_fp8,
+        scale_fp4,
+        fp4_quant_out,
+        fp4_output_scale,
+    ) = create_test_tensors(num_tokens, hidden_dim, dtype, use_residual)
+
+    rms_eps = 1e-6
+    results = {}
+    vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
+    use_oneshot_options = [False] if no_oneshot else [True, False]
+
+    # Create RMSNorm and QuantFP8 layers once for native benchmarks
+
+    if "none" in quant_modes:
+        # Standard AllReduce + RMSNorm
+        for custom_op in ["-rms_norm", "+rms_norm"]:
+            with set_current_vllm_config(
+                VllmConfig(compilation_config=CompilationConfig(custom_ops=[custom_op]))
+            ):
+                try:
+                    suffix = (
+                        "_custom_rms_norm" if "+" in custom_op else "_native_rms_norm"
+                    )
+                    time_ms = benchmark_operation(
+                        vllm_fused_allreduce.allreduce_rmsnorm,
+                        input_tensor,
+                        residual=residual,
+                    )
+                    results[f"standard_allreduce_{suffix}"] = time_ms
+                except Exception as e:
+                    logger.error("Standard AllReduce+RMSNorm failed: %s", e)
+                    results[f"standard_allreduce_{suffix}"] = float("inf")
+
+        # Standard AllReduce + RMSNorm Native Compiled
+        with set_current_vllm_config(
+            VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
+        ):
+            try:
+                standard_allreduce_rmsnorm_native_compiled = torch.compile(
+                    vllm_fused_allreduce.allreduce_rmsnorm,
+                    fullgraph=True,
+                    dynamic=False,
+                )
+                time_ms = benchmark_operation(
+                    standard_allreduce_rmsnorm_native_compiled,
+                    input_tensor,
+                    residual=residual,
+                )
+                results["standard_allreduce_rmsnorm_native_compiled"] = time_ms
+            except Exception as e:
+                logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e)
+                results["standard_allreduce_rmsnorm_native_compiled"] = float("inf")
+
+        # FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            for use_oneshot in use_oneshot_options:
+                suffix = "_oneshot" if use_oneshot else "_twoshot"
+                try:
+                    time_ms = benchmark_operation(
+                        flashinfer_fused_allreduce_rmsnorm,
+                        input_tensor,
+                        residual=residual,
+                        norm_out=norm_out,
+                        rms_gamma=rms_gamma,
+                        rms_eps=rms_eps,
+                        allreduce_params=allreduce_params,
+                        use_oneshot=use_oneshot,
+                    )
+                    results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms
+                except Exception as e:
+                    logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e)
+                    results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float(
+                        "inf"
+                    )
+
+    if "fp8" in quant_modes:
+        # Standard AllReduce + RMSNorm + FP8 Quant
+        for rms_norm_custom_op in ["-rms_norm", "+rms_norm"]:
+            suffix = (
+                "_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm"
+            )
+            for quant_fp8_custom_op in ["-quant_fp8", "+quant_fp8"]:
+                suffix += (
+                    "_custom_quant_fp8"
+                    if "+" in quant_fp8_custom_op
+                    else "_native_quant_fp8"
+                )
+                with set_current_vllm_config(
+                    VllmConfig(
+                        compilation_config=CompilationConfig(
+                            custom_ops=[rms_norm_custom_op, quant_fp8_custom_op]
+                        )
+                    )
+                ):
+                    try:
+                        time_ms = benchmark_operation(
+                            vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
+                            input_tensor,
+                            residual=residual,
+                            scale_factor=scale_fp8,
+                        )
+                        results[f"standard_allreduce{suffix}"] = time_ms
+                    except Exception as e:
+                        logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e)
+                        results[f"standard_allreduce{suffix}"] = float("inf")
+
+        # Standard AllReduce + RMSNorm + FP8 Quant Native Compiled
+        with set_current_vllm_config(
+            VllmConfig(
+                compilation_config=CompilationConfig(
+                    custom_ops=["-rms_norm", "-quant_fp8"]
+                )
+            )
+        ):
+            try:
+                standard_allreduce_rmsnorm_fp8_quant_native_compiled = torch.compile(
+                    vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
+                    fullgraph=True,
+                    dynamic=False,
+                )
+                time_ms = benchmark_operation(
+                    standard_allreduce_rmsnorm_fp8_quant_native_compiled,
+                    input_tensor,
+                    residual=residual,
+                    scale_factor=scale_fp8,
+                )
+                results["standard_allreduce_rmsnorm_fp8_quant_native_compiled"] = (
+                    time_ms
+                )
+            except Exception as e:
+                logger.error(
+                    "Standard AllReduce+RMSNorm+FP8 Native Compiled failed: %s", e
+                )
+                results["standard_allreduce_rmsnorm_fp8_quant_native_compiled"] = float(
+                    "inf"
+                )
+
+        # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            for use_oneshot in use_oneshot_options:
+                suffix = "_oneshot" if use_oneshot else "_twoshot"
+                try:
+                    time_ms = benchmark_operation(
+                        flashinfer_fused_allreduce_rmsnorm_fp8_quant,
+                        input_tensor,
+                        norm_out=norm_out,
+                        residual=residual,
+                        rms_gamma=rms_gamma,
+                        rms_eps=rms_eps,
+                        scale_factor=scale_fp8,
+                        quant_out=quant_out_fp8,
+                        allreduce_params=allreduce_params,
+                        use_oneshot=use_oneshot,
+                    )
+                    results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
+                        time_ms
+                    )
+                except Exception as e:
+                    logger.error(
+                        "FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s",
+                        e,
+                    )
+                    results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
+                        float("inf")
+                    )
+
+    if "fp4" in quant_modes and current_platform.has_device_capability(100):
+        # Standard AllReduce + RMSNorm + FP4 Quant
+        for rms_norm_custom_op in ["-rms_norm", "+rms_norm"]:
+            suffix = (
+                "_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm"
+            )
+            with set_current_vllm_config(
+                VllmConfig(
+                    compilation_config=CompilationConfig(
+                        custom_ops=[rms_norm_custom_op]
+                    )
+                )
+            ):
+                try:
+                    time_ms = benchmark_operation(
+                        vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
+                        input_tensor,
+                        residual=residual,
+                        input_global_scale=scale_fp4,
+                        quant_out=fp4_quant_out,
+                        output_scale=fp4_output_scale,
+                    )
+                    results[f"standard_allreduce_{suffix}_fp4_quant"] = time_ms
+                except Exception as e:
+                    logger.error("Standard AllReduce+RMSNorm+FP4 failed: %s", e)
+                    results[f"standard_allreduce_{suffix}_fp4_quant"] = float("inf")
+
+        # Standard AllReduce + RMSNorm + FP4 Quant Native Compiled
+        with set_current_vllm_config(
+            VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
+        ):
+            try:
+                standard_allreduce_rmsnorm_fp4_quant_native_compiled = torch.compile(
+                    vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
+                    fullgraph=True,
+                    dynamic=False,
+                )
+                time_ms = benchmark_operation(
+                    standard_allreduce_rmsnorm_fp4_quant_native_compiled,
+                    input_tensor,
+                    residual=residual,
+                    quant_out=fp4_quant_out,
+                    input_global_scale=scale_fp4,
+                    output_scale=fp4_output_scale,
+                )
+                results["standard_allreduce_rmsnorm_fp4_quant_native_compiled"] = (
+                    time_ms
+                )
+            except Exception as e:
+                logger.error(
+                    "Standard AllReduce+RMSNorm+FP4 Native Compiled failed: %s", e
+                )
+                results["standard_allreduce_rmsnorm_fp4_quant_native_compiled"] = float(
+                    "inf"
+                )
+
+        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            for use_oneshot in use_oneshot_options:
+                suffix = "_oneshot" if use_oneshot else "_twoshot"
+                try:
+                    time_ms = benchmark_operation(
+                        flashinfer_fused_allreduce_rmsnorm_fp4_quant,
+                        input_tensor,
+                        residual=residual,
+                        norm_out=norm_out,
+                        rms_gamma=rms_gamma,
+                        rms_eps=rms_eps,
+                        input_global_scale=scale_fp4,
+                        allreduce_params=allreduce_params,
+                        quant_out=fp4_quant_out,
+                        output_scale=fp4_output_scale,
+                        use_oneshot=use_oneshot,
+                    )
+                    results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
+                        time_ms
+                    )
+                except Exception as e:
+                    logger.error(
+                        "FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s",
+                        e,
+                    )
+                    results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
+                        float("inf")
+                    )
+
+        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            try:
+                time_ms = benchmark_operation(
+                    flashinfer_fused_allreduce_rmsnorm_fp4_quant,
+                    input_tensor,
+                    residual=residual,
+                    norm_out=norm_out,
+                    rms_gamma=rms_gamma,
+                    rms_eps=rms_eps,
+                    input_global_scale=scale_fp4,
+                    allreduce_params=allreduce_params,
+                    quant_out=fp4_quant_out,
+                    output_scale=fp4_output_scale,
+                    use_oneshot=False,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = (
+                    time_ms
+                )
+            except Exception as e:
+                logger.error(
+                    "FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s",
+                    e,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float(
+                    "inf"
+                )
+
+    return results
+
+
+def prepare_results_with_speedups(results_dict):
+    """Prepare results with speedup calculations based on dynamic baseline selection."""
+    prepared_results = []
+
+    # Determine the fastest baseline for each operation type
+    def get_fastest_baseline(op_name, results_dict):
+        """Get the fastest baseline between standard and native_compiled versions."""
+        if "fp8_quant" in op_name:
+            candidates = [
+                "standard_allreduce_rmsnorm_fp8_quant",
+                "standard_allreduce_rmsnorm_fp8_quant_native_compiled",
+            ]
+        elif "fp4_quant" in op_name:
+            candidates = [
+                "standard_allreduce_rmsnorm_fp4_quant",
+                "standard_allreduce_rmsnorm_fp4_quant_native_compiled",
+            ]
+        else:
+            candidates = [
+                "standard_allreduce_rmsnorm",
+                "standard_allreduce_rmsnorm_native_compiled",
+            ]
+
+        # Find the fastest among available candidates
+        fastest_time = float("inf")
+        fastest_baseline = None
+
+        for candidate in candidates:
+            if (
+                candidate in results_dict
+                and results_dict[candidate] != float("inf")
+                and results_dict[candidate] < fastest_time
+            ):
+                fastest_time = results_dict[candidate]
+                fastest_baseline = candidate
+
+        return fastest_baseline
+
+    # Create dynamic baseline mapping
+    dynamic_baseline_mapping = {}
+    for op_name in results_dict:
+        if (
+            op_name.startswith("flashinfer_")
+            or op_name.startswith("standard_")
+            and not op_name.endswith("_native_compiled")
+        ):
+            dynamic_baseline_mapping[op_name] = get_fastest_baseline(
+                op_name, results_dict
+            )
+
+    for op_name, time_ms in results_dict.items():
+        if time_ms == float("inf"):
+            speedup_str = "FAILED"
+            time_str = "FAILED"
+        else:
+            time_str = f"{time_ms:.3f}"
+            # Find the appropriate baseline for this operation
+            baseline_op = dynamic_baseline_mapping.get(op_name)
+            if baseline_op and baseline_op in results_dict:
+                baseline_time = results_dict[baseline_op]
+                if baseline_time != float("inf") and baseline_time > 0:
+                    speedup = baseline_time / time_ms
+                    speedup_str = f"{speedup:.2f}x"
+                else:
+                    speedup_str = "N/A"
+            else:
+                # For baseline operations, determine if this is the fastest baseline
+                if op_name.endswith("_native_compiled") or (
+                    op_name.startswith("standard_")
+                    and not op_name.endswith("_native_compiled")
+                ):
+                    fastest_baseline = get_fastest_baseline(op_name, results_dict)
+                    if fastest_baseline == op_name:
+                        speedup_str = "baseline"
+                    else:
+                        if fastest_baseline and fastest_baseline in results_dict:
+                            baseline_time = results_dict[fastest_baseline]
+                            if baseline_time != float("inf") and baseline_time > 0:
+                                speedup = baseline_time / time_ms
+                                speedup_str = f"{speedup:.2f}x"
+                            else:
+                                speedup_str = "N/A"
+                        else:
+                            speedup_str = "N/A"
+                else:
+                    speedup_str = "N/A"
+
+        prepared_results.append(
+            {
+                "operation": op_name,
+                "time_ms": time_ms,
+                "time_str": time_str,
+                "speedup_str": speedup_str,
+            }
+        )
+
+    return prepared_results
+
+
+def print_results(
+    results_dict,
+    num_tokens,
+    hidden_dim,
+    dtype,
+    use_residual,
+    quant_modes,
+    input_size_mb,
+):
+    """Print benchmark results in a formatted table."""
+    print(f"\n{'=' * 80}")
+    print(
+        f"Results: num_tokens={num_tokens}, hidden_dim={hidden_dim} "
+        f"(input size: {input_size_mb:.2f} MB)"
+    )
+    print(
+        f"dtype={dtype}, residual={'yes' if use_residual else 'no'}, "
+        f"quant_modes={','.join(sorted(list(quant_modes)))}"
+    )
+    print(f"{'=' * 80}")
+    print(f"{'Operation':<50} {'Time (ms)':<12} {'Speedup':<10}")
+    print(f"{'-' * 80}")
+
+    # Prepare results with speedup calculations
+    prepared_results = prepare_results_with_speedups(results_dict)
+
+    for result in prepared_results:
+        if result["time_ms"] == float("inf"):
+            time_display = result["time_str"]
+        else:
+            time_display = f"{result['time_ms']:.3f}"
+
+        print(
+            f"{result['operation']:<50} {time_display:<12} {result['speedup_str']:<10}"
+        )
+
+
+def format_results_markdown(
+    all_results: list[dict], world_size: int, args: argparse.Namespace
+) -> str:
+    """Format all benchmark results as markdown."""
+    lines: list[str] = []
+    lines.append("# FlashInfer Fused Collective Operations Benchmark Results")
+    lines.append("")
+    lines.append(f"**World Size:** {world_size}  ")
+    lines.append(f"**Hidden Dimension:** {args.hidden_dim}  ")
+    lines.append(f"**Warmup Iterations:** {args.warmup}  ")
+    lines.append(f"**Benchmark Trials:** {args.trials}  ")
+    modes = ",".join(all_results[0]["quant_modes"]) if all_results else "N/A"
+    lines.append(f"**Quantization Modes:** {modes}  ")
+    lines.append("")
+    lines.append("---")
+    lines.append("")
+
+    for entry in all_results:
+        num_tokens = entry["num_tokens"]
+        dtype = entry["dtype"]
+        use_residual = entry["use_residual"]
+        results_dict = entry["results"]
+        input_size_mb = entry["input_size_mb"]
+        residual_str = "with residual" if use_residual else "no residual"
+
+        lines.append(
+            f"## Configuration: num_tokens={num_tokens}, dtype={dtype}, {residual_str}"
+        )
+        lines.append(f"**Input Size:** {input_size_mb:.2f} MB")
+        lines.append("")
+
+        prepared = prepare_results_with_speedups(results_dict)
+        # Build DataFrame for markdown export
+        rows = [
+            {
+                "Operation": r["operation"].replace("_", " ").title(),
+                "Time (ms)": r["time_str"],
+                "Speedup": r["speedup_str"],
+            }
+            for r in prepared
+        ]
+        df = pd.DataFrame(rows)
+        if df.empty:
+            lines.append("No results.")
+        else:
+            lines.append(df.to_markdown(index=False))
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def save_results_to_file(
+    all_results: list[dict], world_size: int, args: argparse.Namespace, rank: int
+):
+    """Save benchmark results to markdown file (only on rank 0)."""
+    if rank != 0:
+        return
+
+    if not all_results:
+        logger.warning("No results to save")
+        return
+
+    output_path = args.output_file
+
+    try:
+        markdown_content = format_results_markdown(all_results, world_size, args)
+
+        with open(output_path, "a") as f:
+            f.write(markdown_content)
+
+    except Exception as e:
+        logger.error("Failed to save results to file: %s", e)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Benchmark fused collective operations"
+    )
+    parser.add_argument(
+        "--num-tokens",
+        type=int,
+        nargs="+",
+        default=[128, 512, 1024, 2048],
+        help="Numbers of tokens to test",
+    )
+    parser.add_argument(
+        "--hidden-dim", type=int, default=8192, help="Hidden dimension size"
+    )
+    parser.add_argument(
+        "--dtypes",
+        type=str,
+        nargs="+",
+        default=["bfloat16"],
+        choices=["float16", "bfloat16", "float32"],
+        help="Data types to test",
+    )
+    parser.add_argument(
+        "--no-residual",
+        action="store_true",
+        help="Skip residual connection tests",
+    )
+
+    parser.add_argument(
+        "--quant-modes",
+        type=str,
+        default="none,fp8,fp4",
+        help=(
+            "Comma-separated quantization modes to run: none, fp8, fp4. "
+            "Default: none,fp8,fp4"
+        ),
+    )
+
+    parser.add_argument(
+        "--warmup", type=int, default=5, help="Number of warmup iterations"
+    )
+    parser.add_argument(
+        "--trials", type=int, default=20, help="Number of benchmark trials"
+    )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        help="""Output file path for markdown results 
+                (default: benchmark_results_<timestamp>.md)
+        """,
+    )
+
+    parser.add_argument(
+        "--no-oneshot",
+        action="store_true",
+        help="Skip oneshot benchmarks",
+    )
+
+    args = parser.parse_args()
+
+    # Check if running with torchrun (required for collective operations)
+    if "RANK" not in os.environ or "WORLD_SIZE" not in os.environ:
+        raise RuntimeError(
+            "Must run with torchrun for distributed benchmarking. "
+            "Example: torchrun --nproc_per_node=2 benchmark_fused_collective.py"
+        )
+
+    # Initialize distributed environment
+    rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    # Validate world size (must be > 1 for collective operations)
+    if world_size <= 1:
+        raise ValueError(
+            "World size must be > 1 for collective operations benchmarking. "
+            f"Current world size: {world_size}. Use torchrun with --nproc_per_node > 1."
+        )
+
+    # Parse quantization modes
+    valid_quant_modes = {"none", "fp8", "fp4"}
+    raw_modes = [
+        m.strip().lower() for m in (args.quant_modes or "").split(",") if m.strip()
+    ]
+    quant_modes = set(raw_modes) if raw_modes else {"none", "fp8", "fp4"}
+    invalid = sorted(list(quant_modes - valid_quant_modes))
+    if invalid:
+        raise ValueError(
+            f"Invalid --quant-modes entries: {','.join(invalid)}. "
+            f"Valid options are: {','.join(sorted(valid_quant_modes))}."
+        )
+
+    if rank == 0:
+        logger.info("Running benchmark with world_size=%s, rank=%s", world_size, rank)
+        logger.info("Quantization modes: %s", ",".join(sorted(list(quant_modes))))
+        if flashinfer_comm is not None:
+            logger.info(
+                "FlashInfer available - will benchmark fused operations",
+            )
+        else:
+            logger.info(
+                "FlashInfer not available - only benchmarking standard operations"
+            )
+
+    # Convert dtype strings to torch dtypes
+    dtype_map = {
+        "float16": torch.float16,
+        "bfloat16": torch.bfloat16,
+        "float32": torch.float32,
+    }
+    dtypes = [dtype_map[dt] for dt in args.dtypes]
+
+    # Test configurations
+    residual_options = [True] if not args.no_residual else [False]
+
+    configs = list(itertools.product(args.num_tokens, dtypes, residual_options))
+
+    # Setup FlashInfer workspace if available
+    ipc_handles = None
+    allreduce_params = None
+
+    if flashinfer_comm is not None:
+        # Use the largest hidden dimension for workspace setup
+        max_num_token = _FI_MAX_SIZES.get(world_size) // (
+            args.hidden_dim * world_size * 2
+        )
+
+        ipc_handles, workspace_tensor = setup_flashinfer_workspace(
+            world_size, rank, args.hidden_dim, max_num_token
+        )
+
+        if workspace_tensor is not None:
+            allreduce_params = FlashInferFusedAllReduceParams(
+                rank=rank,
+                world_size=world_size,
+                max_token_num=max_num_token,
+            )
+
+    # Collect all results for markdown export
+    all_results = []
+
+    try:
+        # Run benchmarks
+        for num_tokens, dtype, use_residual in configs:
+            if rank == 0:
+                logger.info(
+                    "\nTesting:  num_tokens=%s, hidden_dim=%s, dtype=%s, residual=%s",
+                    num_tokens,
+                    args.hidden_dim,
+                    dtype,
+                    use_residual,
+                )
+
+            results = run_benchmarks(
+                num_tokens,
+                args.hidden_dim,
+                dtype,
+                use_residual,
+                allreduce_params,
+                quant_modes=quant_modes,
+                no_oneshot=args.no_oneshot,
+            )
+
+            # Store results for markdown export
+            if rank == 0:
+                # Calculate input size in MB
+                input_size_mb = (
+                    num_tokens * args.hidden_dim * torch.finfo(dtype).bits
+                ) / (8 * 1024 * 1024)
+                all_results.append(
+                    {
+                        "num_tokens": num_tokens,
+                        "hidden_dim": args.hidden_dim,
+                        "dtype": str(dtype).replace("torch.", ""),
+                        "use_residual": use_residual,
+                        "quant_modes": sorted(list(quant_modes)),
+                        "input_size_mb": input_size_mb,
+                        "results": results,
+                    }
+                )
+
+                print_results(
+                    results,
+                    num_tokens,
+                    args.hidden_dim,
+                    dtype,
+                    use_residual,
+                    quant_modes,
+                    input_size_mb,
+                )
+
+        # Save results to markdown file
+        if args.output_file and rank == 0:
+            save_results_to_file(all_results, world_size, args, rank)
+
+    finally:
+        # Cleanup
+        if ipc_handles is not None:
+            cleanup_flashinfer_workspace(ipc_handles)
+
+        dist.barrier()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index d525bd5faacf6..9b426d8d5f778 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -16,8 +16,8 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = [
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
-    "nm-testing/deepseekv2-lite",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "deepseek-ai/DeepSeek-V2-Lite",
     "ibm-granite/granite-3.0-1b-a400m",
     "ibm-granite/granite-3.0-3b-a800m",
 ]
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index bf1512268fe0b..6715c9b548aa1 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -19,13 +19,24 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from utils import ArgPool, Bench, CudaGraphBenchParams
 from weight_shapes import WEIGHT_SHAPES
 
-from vllm.triton_utils import HAS_TRITON
+from vllm.lora.ops.triton_ops.utils import get_lora_op_configs
+from vllm.triton_utils import HAS_TRITON, triton
 
 if HAS_TRITON:
-    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
+    from vllm.lora.ops.triton_ops import (  ## added fused_moe_lora
+        LoRAKernelMeta,
+        fused_moe_lora_expand,
+        fused_moe_lora_shrink,
+        lora_expand,
+        lora_shrink,
+    )
+    from vllm.lora.ops.triton_ops.fused_moe_lora_op import (
+        _LORA_PTR_DICT,  ## added _LORA_PTR_DICT for fused_moe_lora
+    )
     from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-
+from vllm import _custom_ops as ops
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.math_utils import round_up
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_TP_SIZES = [1]
@@ -59,6 +70,8 @@ DEFAULT_NUM_LORAS = [1, 2, 3, 4]
 DEFAULT_SORT_BY_LORA_IDS = [False, True]
 DEFAULT_SEQ_LENGTHS = [1]
 DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]
+DEFAULT_TOP_K_NUMS = [1]  # Added for MoE LoRA top_k
+DEFAULT_NUM_EXPERTS = [8]  # Added for MoE LoRA num_experts
 
 
 # Utilities
@@ -191,6 +204,11 @@ class OpType(Enum):
 
     LORA_SHRINK = auto()
     LORA_EXPAND = auto()
+    ## Adding support for fused moe lora
+    FUSED_MOE_LORA_GATE_UP_SHRINK = auto()  ## Gate/Up projection variant with shrink
+    FUSED_MOE_LORA_GATE_UP_EXPAND = auto()  ## Gate/Up projection variant with expand
+    FUSED_MOE_LORA_DOWN_SHRINK = auto()  ## Down projection variant with shrink
+    FUSED_MOE_LORA_DOWN_EXPAND = auto()  ## Down projection variant with expand
 
     @staticmethod
     def from_str(s: str) -> "OpType":
@@ -198,6 +216,15 @@ class OpType(Enum):
             return OpType.LORA_SHRINK
         if s.lower() == "lora_expand":
             return OpType.LORA_EXPAND
+        # Adding support for fused moe lora, both in gate_up and down
+        if s.lower() == "fused_moe_lora_gate_up_shrink":  ## Gate/Up variant with shrink
+            return OpType.FUSED_MOE_LORA_GATE_UP_SHRINK
+        if s.lower() == "fused_moe_lora_gate_up_expand":  ## Gate/Up variant with expand
+            return OpType.FUSED_MOE_LORA_GATE_UP_EXPAND
+        if s.lower() == "fused_moe_lora_down_shrink":  ## Down variant with shrink
+            return OpType.FUSED_MOE_LORA_DOWN_SHRINK
+        if s.lower() == "fused_moe_lora_down_expand":  ## Down variant with expand
+            return OpType.FUSED_MOE_LORA_DOWN_EXPAND
         raise ValueError(f"Unrecognized str {s} to convert to OpType")
 
     def is_shrink_fn(self) -> bool:
@@ -206,19 +233,56 @@ class OpType(Enum):
     def is_expand_fn(self) -> bool:
         return self in [OpType.LORA_EXPAND]
 
+    def is_fused_moe_lora_fn(self) -> bool:  ## adding for fused MoE LoRA
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
+    def is_fused_moe_lora_gate_up_fn(
+        self,
+    ) -> bool:  ## adding for fused MoE LoRA Gate/Up
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+        ]
+
+    def is_fused_moe_lora_down_fn(self) -> bool:  ## adding for fused MoE LoRA Down
+        return self in [
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
+    def is_fused_moe_lora_shrink_fn(self) -> bool:
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+        ]
+
+    def is_fused_moe_lora_expand_fn(self) -> bool:
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
     def num_slices(self) -> list[int]:
+        if self.is_fused_moe_lora_gate_up_fn():
+            return [2]
+        elif self.is_fused_moe_lora_down_fn():
+            return [1]
         return [1, 2, 3]
 
     def mkn(
         self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int
     ) -> tuple[int, int, int]:
         num_tokens = batch_size * seq_length
-        if self.is_shrink_fn():
+        if self.is_shrink_fn() or self.is_fused_moe_lora_fn():
             m = num_tokens
             k = hidden_size
             n = lora_rank
-        else:
-            assert self.is_expand_fn()
+        elif self.is_expand_fn():
             m = num_tokens
             k = lora_rank
             n = hidden_size
@@ -232,9 +296,36 @@ class OpType(Enum):
         """
         if self.is_shrink_fn():
             return op_dtype, op_dtype, torch.float32
-        else:
-            assert self.is_expand_fn()
+        elif self.is_expand_fn():
             return torch.float32, op_dtype, op_dtype
+        else:
+            assert self.is_fused_moe_lora_fn()
+            return op_dtype, op_dtype, op_dtype
+
+    def matmul_shapes_fused_moe_lora(
+        self,
+        m: int,
+        n: int,
+        k: int,
+        num_loras: int,
+        num_slices: int,
+        top_k_num: int,
+        num_experts: int,
+    ) -> tuple[tuple[int], tuple[int], tuple[int], tuple[int]]:
+        if self.is_fused_moe_lora_shrink_fn():
+            input_shape = (
+                (m * top_k_num, n)
+                if self in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
+                else (m, n)
+            )
+            output_shape = (num_slices, m, top_k_num, k)
+            weight_shape = (num_loras, num_experts, k, n)
+        else:
+            assert self.is_fused_moe_lora_expand_fn()
+            input_shape = (num_slices, m, top_k_num, k)
+            output_shape = (m, top_k_num, n * num_slices)
+            weight_shape = (num_loras, num_experts, n, k)
+        return (input_shape, weight_shape, output_shape)
 
     def matmul_shapes(
         self,
@@ -244,6 +335,8 @@ class OpType(Enum):
         lora_rank: int,
         num_loras: int,
         num_slices: int,
+        top_k_num: int | None = None,
+        num_experts: int | None = None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         """
         Given num_slices, return the shapes of the A, B, and C matrices
@@ -258,6 +351,16 @@ class OpType(Enum):
         if self in [OpType.LORA_EXPAND]:
             # LoRA expand kernels support num_slices inherently in the kernel
             return ((num_slices, m, k), b_shape, (m, n * num_slices))
+        if self.is_fused_moe_lora_fn():
+            return self.matmul_shapes_fused_moe_lora(
+                m,
+                k,
+                n,
+                num_loras,
+                num_slices,
+                top_k_num,
+                num_experts,
+            )
         raise ValueError(f"Unrecognized op_type {self}")
 
     def bench_fn(self) -> Callable:
@@ -265,6 +368,16 @@ class OpType(Enum):
             return lora_shrink
         if self == OpType.LORA_EXPAND:
             return lora_expand
+        if self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+        ]:
+            return fused_moe_lora_shrink
+        if self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]:
+            return fused_moe_lora_expand
 
         raise ValueError(f"Unrecognized optype {self}")
 
@@ -318,6 +431,8 @@ class BenchmarkContext:
     sort_by_lora_id: bool
     dtype: torch.dtype
     seq_length: int | None = None
+    num_experts: int | None = None  # num_experts for MoE based ops
+    top_k_num: int | None = None  # top_k for MoE based ops
     num_slices: int | None = None  # num_slices for slice based ops
 
     def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
@@ -373,6 +488,11 @@ class BenchmarkTensors:
             f"{dtype_to_str(self.output.dtype)}"
         )
 
+    def get_num_tokens(self, size: int, top_k_num: int, op_type: OpType):
+        return (
+            size * top_k_num if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] else size
+        )
+
     @staticmethod
     def make(
         ctx: BenchmarkContext, op_type: OpType, device: str = "cuda"
@@ -385,6 +505,8 @@ class BenchmarkTensors:
             ctx.lora_rank,
             ctx.num_loras,
             ctx.num_slices,
+            ctx.top_k_num,
+            ctx.num_experts,
         )
         a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
         input_tensor, lora_weights, output_tensor = make_rand_tensors(
@@ -432,17 +554,27 @@ class BenchmarkTensors:
             prompt_lora_indices_tensor,
         )
 
-    def sanity_check(self) -> None:
+    def sanity_check(self, ctx: BenchmarkContext, op_type: OpType) -> None:
         """
         Fails asserts when non-conformality is detected.
         """
-        num_tokens = self.input.shape[-2]
+        num_tokens = (
+            self.input.shape[1]
+            if op_type.is_fused_moe_lora_expand_fn()
+            else self.input.shape[-2]
+        )
         # check metadata tensors
-        assert torch.sum(self.seq_lens) == num_tokens
+        ## In down shrink case, each token is repeated top_k_num times
+        assert num_tokens == self.get_num_tokens(
+            torch.sum(self.seq_lens), ctx.top_k_num, op_type
+        ), f"Expected {num_tokens} tokens, but got {torch.sum(self.seq_lens)}"
         num_seqs = self.seq_lens.shape[0]
         # assert self.seq_start_loc.shape[0] == num_seqs
+        ## In down shrink case, each prompt corresponds to top_k_num sequences
         assert self.prompt_lora_mapping.shape[0] == num_seqs
-        assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
+        assert self.get_num_tokens(
+            self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
+        )
 
     def to_device(self, device: str):
         """
@@ -471,21 +603,111 @@ class BenchmarkTensors:
                 to_device(field) if field_name != "no_lora_flag_cpu" else field,
             )
 
-    def metadata(self) -> tuple[int, int, int]:
+    def metadata(self, ctx: BenchmarkContext, op_type: OpType) -> tuple[int, int, int]:
         """
         Return num_seqs, num_tokens and max_seq_len
         """
         num_seqs = self.seq_lens.shape[0]
-        num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
+        num_tokens = self.get_num_tokens(
+            self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
+        )
         max_seq_len = torch.max(self.seq_lens).item()
         num_slices = len(self.lora_weights_lst)
         return num_seqs, num_tokens, max_seq_len, num_slices
 
-    def as_lora_shrink_kwargs(self) -> dict[str, Any]:
-        self.sanity_check()
+    def fused_moe_lora_data_prepare(
+        self,
+        block_size: int,
+        token_lora_mapping: torch.Tensor,
+        ctx: BenchmarkContext,
+    ):
+        def moe_lora_align_block_size(
+            topk_ids: torch.Tensor,
+            token_lora_mapping: torch.Tensor,
+            block_size: int,
+            num_experts: int,
+            max_loras: int,
+            expert_map: torch.Tensor | None = None,
+            pad_sorted_ids: bool = False,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            """
+            Aligns tokens and experts into block-sized chunks for LoRA-based
+            mixture-of-experts (MoE) execution.
+            """
+            max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+            if pad_sorted_ids:
+                max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+            sorted_ids = torch.empty(
+                (max_loras * max_num_tokens_padded,),
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+            # Expert ids must be set default to -1 to prevent a blank block
+            expert_ids = torch.empty(
+                (max_loras * max_num_m_blocks,),
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            num_tokens_post_pad = torch.empty(
+                (max_loras), dtype=torch.int32, device=topk_ids.device
+            )
+
+            ops.moe_lora_align_block_size(
+                topk_ids,
+                token_lora_mapping,
+                num_experts,
+                block_size,
+                max_loras,
+                max_num_tokens_padded,
+                max_num_m_blocks,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+            )
+            if expert_map is not None:
+                expert_ids = expert_map[expert_ids]
+
+            return sorted_ids, expert_ids, num_tokens_post_pad
+
+        num_tokens = ctx.batch_size
+        curr_topk_ids = torch.randint(
+            0,
+            ctx.num_experts,
+            (num_tokens, ctx.top_k_num),
+            device="cuda",
+            dtype=torch.int32,
+        )
+        topk_weights = torch.randint(
+            0,
+            ctx.num_experts,
+            (num_tokens, ctx.top_k_num),
+            device="cuda",
+            dtype=torch.int32,
+        )
+
+        (sorted_token_ids_lora, expert_ids_lora, num_tokens_post_padded_lora) = (
+            moe_lora_align_block_size(
+                topk_ids=curr_topk_ids,
+                token_lora_mapping=token_lora_mapping,
+                block_size=block_size,
+                num_experts=ctx.num_experts,
+                max_loras=ctx.num_loras,
+            )
+        )
+
+        sorted_token_ids = sorted_token_ids_lora.view(ctx.num_loras, -1)
+        expert_ids = expert_ids_lora.view(ctx.num_loras, -1)
+        num_tokens_post_padded = num_tokens_post_padded_lora
+        return (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded)
+
+    def as_lora_shrink_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
         self.to_device(self.input.device)
 
-        _, num_tokens, _, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
 
         # Sanity check matrix shapes.
         i_shape, lw_shape, o_shape = (
@@ -520,11 +742,13 @@ class BenchmarkTensors:
             "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }
 
-    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-        self.sanity_check()
+    def as_lora_expand_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
         self.to_device(self.input.device)
 
-        _, num_tokens, _, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
 
         # Sanity check matrix shapes.
         i_shape, lw_shape, o_shape = (
@@ -561,18 +785,173 @@ class BenchmarkTensors:
             "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }
 
-    def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: bool | None = None
+    def as_fused_moe_lora_shrink_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
     ) -> dict[str, Any]:
-        if op_type.is_shrink_fn():
+        self.sanity_check(ctx, op_type)
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = (
+            self.input.shape,
+            self.lora_weights_lst[0].shape,
+            self.output.shape,
+        )
+        # Expected input shape : [num_tokens, hidden_size] for gate_up
+        # Expected input shape : [top_k_num * num_tokens, hidden_size] for down
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape [max_lora, num_experts, lora_rank, hidden_size]
+        assert len(lw_shape) == 4
+        assert lw_shape[-1] == hidden_size
+        lora_rank = lw_shape[-2]
+        # Expected output shape : [num_slices, num_tokens, top_k_num, lora_rank]
+        assert len(o_shape) == 4
+        assert (
+            o_shape
+            == (num_slices, num_tokens // ctx.top_k_num, ctx.top_k_num, lora_rank)
+            if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
+            else o_shape == (num_slices, num_tokens, ctx.top_k_num, lora_rank)
+        )
+        kernel_config = get_lora_op_configs(
+            op_type.name.lower(),
+            max_loras=lw_shape[0],
+            batch=num_tokens,
+            hidden_size=hidden_size,
+            rank=lora_rank,
+            num_slices=num_slices,
+            add_inputs=False,
+        )
+
+        (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
+            self.fused_moe_lora_data_prepare(
+                block_size=kernel_config["BLOCK_SIZE_M"],
+                token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
+                ctx=ctx,
+            )
+        )
+
+        return {
+            "qcurr_hidden_states": self.input,
+            "lora_a_stacked": self.lora_weights_lst,
+            "a_intermediate_cache1": self.output,
+            "topk_weights": topk_weights,
+            "sorted_token_ids": sorted_token_ids,
+            "expert_ids": expert_ids,
+            "num_tokens_post_padded": num_tokens_post_padded,
+            "top_k_num": ctx.top_k_num,
+            "device": self.input.device,
+            "N": lora_rank,
+            "M": topk_weights.shape[0],
+            "EM": sorted_token_ids.shape[1],
+            "K": self.input.shape[1],
+            "num_tokens": num_tokens,
+            "num_experts": ctx.num_experts,
+            "num_slices": num_slices,
+            "shrink_block_size_m": kernel_config["BLOCK_SIZE_M"],
+            "shrink_block_size_n": kernel_config["BLOCK_SIZE_N"],
+            "shrink_block_size_k": kernel_config["BLOCK_SIZE_K"],
+            "shrink_group_size_m": kernel_config["GROUP_SIZE_M"],
+            "shrink_num_warps": kernel_config["NUM_WARPS"],
+            "shrink_num_stages": kernel_config["NUM_STAGES"],
+            "shrink_split_k": kernel_config.get("SPLIT_K", 1),
+            "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
+        }
+
+    def as_fused_moe_lora_expand_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = (
+            self.input.shape,
+            self.lora_weights_lst[0].shape,
+            self.output.shape,
+        )
+
+        # Expected input shape : [num_slices, num_tokens, top_k_num, lora_rank]
+        assert len(i_shape) == 4
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[-1]
+        # Expected lora weight shape : [num_loras, num_experts, hidden_size, lora_rank]
+        assert len(lw_shape) == 4
+        assert lw_shape[-1] == lora_rank
+        hidden_size = lw_shape[-2]
+        # Expected output shape : [num_tokens, top_k_num, hidden_size * num_slices]
+        assert len(o_shape) == 3
+        assert o_shape == (num_tokens, ctx.top_k_num, hidden_size * num_slices)
+
+        kernel_config = get_lora_op_configs(
+            op_type.name.lower(),
+            max_loras=lw_shape[0],
+            batch=num_tokens,
+            hidden_size=hidden_size,
+            rank=lora_rank,
+            num_slices=num_slices,
+            add_inputs=False,
+        )
+
+        (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
+            self.fused_moe_lora_data_prepare(
+                block_size=kernel_config["BLOCK_SIZE_M"],
+                token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
+                ctx=ctx,
+            )
+        )
+
+        return {
+            "a_intermediate_cache1": self.input,
+            "lora_b_stacked": self.lora_weights_lst,
+            "output": self.output,
+            "topk_weights": topk_weights,
+            "sorted_token_ids": sorted_token_ids,
+            "expert_ids": expert_ids,
+            "num_tokens_post_padded": num_tokens_post_padded,
+            "top_k_num": ctx.top_k_num,
+            "device": self.input.device,
+            "N": lora_rank,
+            "M": topk_weights.shape[0],
+            "EM": sorted_token_ids.shape[1],
+            "K": self.input.shape[1],
+            "num_tokens": num_tokens,
+            "num_experts": ctx.num_experts,
+            "num_slices": num_slices,
+            "max_lora_rank": lora_rank,
+            "w1_output_dim_size": lw_shape[2],
+            "expand_block_size_m": kernel_config["BLOCK_SIZE_M"],
+            "expand_block_size_n": kernel_config["BLOCK_SIZE_N"],
+            "expand_block_size_k": kernel_config["BLOCK_SIZE_K"],
+            "expand_group_size_m": kernel_config["GROUP_SIZE_M"],
+            "expand_num_warps": kernel_config["NUM_WARPS"],
+            "expand_num_stages": kernel_config["NUM_STAGES"],
+            "expand_split_k": kernel_config.get("SPLIT_K", 1),
+            "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
+        }
+
+    def bench_fn_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool | None = None
+    ) -> dict[str, Any]:
+        if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
             assert add_inputs is None
         else:
             assert add_inputs is not None
 
         if op_type == OpType.LORA_SHRINK:
-            return self.as_lora_shrink_kwargs()
+            return self.as_lora_shrink_kwargs(ctx, op_type)
         if op_type == OpType.LORA_EXPAND:
-            return self.as_lora_expand_kwargs(add_inputs)
+            return self.as_lora_expand_kwargs(ctx, op_type, add_inputs)
+        if op_type.is_fused_moe_lora_shrink_fn():
+            return self.as_fused_moe_lora_shrink_kwargs(ctx, op_type)
+        if op_type.is_fused_moe_lora_expand_fn():
+            return self.as_fused_moe_lora_expand_kwargs(ctx, op_type)
         raise ValueError(f"Unrecognized optype {self}")
 
     def test_correctness(
@@ -617,7 +996,7 @@ def bench_optype(
     test_correctness: bool = False,
 ) -> TMeasurement:
     assert arg_pool_size >= 1
-    if op_type.is_shrink_fn():
+    if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
         assert expand_fn_add_inputs is None
     else:
         assert expand_fn_add_inputs is not None
@@ -627,23 +1006,30 @@ def bench_optype(
         BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)
     ]
     for bt in bench_tensors:
-        bt.sanity_check()
+        bt.sanity_check(ctx, op_type)
 
     # Test correctness of our implementation.
     if test_correctness:
+        assert op_type in [OpType.LORA_SHRINK, OpType.LORA_EXPAND], (
+            f"Correctness testing is not supported for {op_type.name}."
+        )
         assert all(
-            [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors]
+            [
+                bt.test_correctness(ctx, op_type, expand_fn_add_inputs)
+                for bt in bench_tensors
+            ]
         )
 
     # BenchmarkTensors -> dict (kwargs)
     kwargs_list = [
-        bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
+        bt.bench_fn_kwargs(ctx, op_type, add_inputs=expand_fn_add_inputs)
         for bt in bench_tensors
     ]
 
     # Clear LoRA optimization hash-maps.
     _LORA_A_PTR_DICT.clear()
     _LORA_B_PTR_DICT.clear()
+    _LORA_PTR_DICT.clear()
     # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
     for kwargs in kwargs_list:
         op_type.bench_fn()(**kwargs)
@@ -793,7 +1179,9 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
 
                     # Benchmark bench_op
                     expand_fn_add_inputs = (
-                        [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs
+                        [None]
+                        if bench_op.is_shrink_fn() or bench_op.is_fused_moe_lora_fn()
+                        else args.expand_fn_add_inputs
                     )
                     for add_input_arg in expand_fn_add_inputs:
                         seq_len_timers.append(
@@ -831,12 +1219,22 @@ def as_benchmark_contexts(
     hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace
 ) -> list[BenchmarkContext]:
     ctxs: list[BenchmarkContext] = []
-    for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product(  # noqa
+    for (
+        batch_size,
+        hidden_size,
+        lora_rank,
+        num_loras,
+        sort_by_lora_id,
+        top_k_num,
+        num_experts,
+    ) in product(  # noqa
         args.batch_sizes,
         list(hidden_sizes),
         lora_ranks,
         args.num_loras,
         args.sort_by_lora_id,
+        args.top_k_nums,
+        args.num_experts,
     ):
         ctxs.append(
             BenchmarkContext(
@@ -851,6 +1249,8 @@ def as_benchmark_contexts(
                 seq_length=None,
                 sort_by_lora_id=sort_by_lora_id,
                 dtype=args.dtype,
+                top_k_num=top_k_num,
+                num_experts=num_experts,
                 # To be filled based on the OpType to benchmark
                 num_slices=None,
             )
@@ -1012,6 +1412,22 @@ if __name__ == "__main__":
             ),
         )
 
+        p.add_argument(
+            "--top-k-nums",
+            nargs="+",
+            type=int,
+            default=DEFAULT_TOP_K_NUMS,
+            help="Top-K values for MoE LoRA operations",
+        )
+
+        p.add_argument(
+            "--num-experts",
+            nargs="+",
+            type=int,
+            default=DEFAULT_NUM_EXPERTS,
+            help="Number of experts for MoE LoRA operations",
+        )
+
     parser = FlexibleArgumentParser(
         description=f"""
 Benchmark LoRA kernels:
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index bc6cf83bc21fd..c99951aa27826 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -211,7 +211,7 @@ def get_rocm_tuning_space(use_fp16):
     num_warps_range = [1, 2, 4, 8]
     group_m_range = [1, 4, 8, 16, 32]
     num_stage_range = [2]
-    waves_per_eu_range = [0]
+    waves_per_eu_range = [0, 1, 2, 4]
     matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
     kpack_range = [1, 2] if use_fp16 else []
 
@@ -590,6 +590,7 @@ def main(args: argparse.Namespace):
         "DeepseekV3ForCausalLM",
         "DeepseekV32ForCausalLM",
         "Glm4MoeForCausalLM",
+        "NemotronHForCausalLM",
     ):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
@@ -615,6 +616,11 @@ def main(args: argparse.Namespace):
         topk = config.moe_topk[0]
         intermediate_size = config.moe_intermediate_size[0]
         hidden_size = config.hidden_size
+    elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
+        E = config.thinker_config.text_config.num_experts
+        topk = config.thinker_config.text_config.num_experts_per_tok
+        intermediate_size = config.thinker_config.text_config.moe_intermediate_size
+        hidden_size = config.thinker_config.text_config.hidden_size
     else:
         # Support for llama4
         config = config.get_text_config()
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 29ef6409bb166..074b7a440b612 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,97 +1,76 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from itertools import accumulate
+import itertools
 
-import nvtx
 import torch
 
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
-from vllm.platforms import current_platform
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
+batch_size_range = [2**i for i in range(0, 8, 2)]
+seq_len_range = [2**i for i in range(6, 10, 1)]
+num_heads_range = [32, 48]
+configs = list(itertools.product(batch_size_range, seq_len_range, num_heads_range))
 
-def benchmark_rope_kernels_multi_lora(
-    is_neox_style: bool,
-    batch_size: int,
-    seq_len: int,
-    num_heads: int,
-    head_size: int,
-    rotary_dim: int | None,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    max_position: int = 8192,
-    base: float = 10000,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    if rotary_dim is None:
-        rotary_dim = head_size
-    # silulating serving 4 LoRAs
-    scaling_factors = [1, 2, 4, 8]
-    # batched RoPE can take multiple scaling factors
-    batched_rope = get_rope(
-        head_size,
-        rotary_dim,
-        max_position,
-        base,
-        is_neox_style,
-        {"rope_type": "linear", "factor": tuple(scaling_factors)},
-    )
-    # non-batched RoPE takes only one scaling factor, we create multiple
-    # instances to simulate the same behavior
-    non_batched_ropes: list[RotaryEmbedding] = []
-    for scaling_factor in scaling_factors:
-        non_batched_ropes.append(
-            get_rope(
-                head_size,
-                rotary_dim,
-                max_position,
-                base,
-                is_neox_style,
-                {"rope_type": "linear", "factor": (scaling_factor,)},
-            )
-        )
 
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype)
-    key = torch.randn_like(query)
-
-    # create query offsets for batched RoPE, we concat multiple kv cache
-    # together and each query needs to find the right kv cache of its type
-    offset_map = torch.tensor(
-        list(
-            accumulate(
-                [0]
-                + [
-                    max_position * scaling_factor * 2
-                    for scaling_factor in scaling_factors[:-1]
-                ]
-            )
+def get_benchmark(head_size, rotary_dim, is_neox_style, device):
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["batch_size", "seq_len", "num_heads"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["torch", "flashinfer", "vllm"],
+            line_names=["PyTorch", "FlashInfer", "vLLM"],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name=f"rope-perf{'-neox-style' if is_neox_style else ''}",
+            args={},
         )
     )
-    query_types = torch.randint(
-        0, len(scaling_factors), (batch_size, seq_len), device=device
-    )
-    # map query types to offsets
-    query_offsets = offset_map[query_types]
-    # the kernel takes flattened offsets
-    flatten_offsets = query_offsets.flatten()
+    def benchmark(batch_size, seq_len, num_heads, provider):
+        dtype = torch.bfloat16
+        max_position = 8192
+        base = 10000
+        rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+        rope = rope.to(dtype=dtype, device=device)
+        cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
 
-    # batched queries of the same type together for non-batched RoPE
-    queries = [query[query_types == i] for i in range(len(scaling_factors))]
-    keys = [key[query_types == i] for i in range(len(scaling_factors))]
-    packed_qkr = zip(queries, keys, non_batched_ropes)
-    # synchronize before start timing
-    torch.cuda.synchronize()
-    with nvtx.annotate("non-batched", color="yellow"):
-        for q, k, r in packed_qkr:
-            r.forward(positions, q, k)
-    torch.cuda.synchronize()
-    with nvtx.annotate("batched", color="green"):
-        batched_rope.forward(positions, query, key, flatten_offsets)
-    torch.cuda.synchronize()
+        positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
+        query = torch.randn(
+            (batch_size, seq_len, num_heads * head_size), dtype=dtype, device=device
+        )
+        key = torch.randn_like(query)
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "torch":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rope.forward_native(positions, query.clone(), key.clone()),
+                quantiles=quantiles,
+            )
+        elif provider == "flashinfer":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: torch.ops.vllm.flashinfer_rotary_embedding(
+                    positions,
+                    query.clone(),
+                    key.clone(),
+                    head_size,
+                    cos_sin_cache,
+                    is_neox_style,
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rope.forward_cuda(positions, query.clone(), key.clone()),
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
 
 
 if __name__ == "__main__":
@@ -116,17 +95,12 @@ if __name__ == "__main__":
     parser.add_argument(
         "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0"
     )
+    parser.add_argument("--save-path", type=str, default="./configs/rope/")
     args = parser.parse_args()
-    print(args)
 
-    benchmark_rope_kernels_multi_lora(
-        is_neox_style=args.is_neox_style,
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        num_heads=args.num_heads,
-        head_size=args.head_size,
-        rotary_dim=args.rotary_dim,
-        dtype=getattr(torch, args.dtype),
-        seed=args.seed,
-        device=args.device,
+    # Get the benchmark function
+    benchmark = get_benchmark(
+        args.head_size, args.rotary_dim, args.is_neox_style, args.device
     )
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py
index 18c459c31d3f8..3e23c4cac059c 100644
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -78,11 +78,11 @@ WEIGHT_SHAPES = {
 }
 
 WEIGHT_SHAPES_MOE = {
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1": [
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
         [8, 2, 4096, 28672],
         [8, 2, 14336, 4096],
     ],
-    "nm-testing/deepseekv2-lite": [
+    "deepseek-ai/DeepSeek-V2-Lite": [
         [64, 6, 2048, 1408],
     ],
     "ibm-granite/granite-3.0-1b-a400m": [
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 192d349b30099..dbda19fbcbf20 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -343,7 +343,7 @@ message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")
 # Define extension targets
 #
 
-define_gpu_extension_target(
+define_extension_target(
     _C
     DESTINATION vllm
     LANGUAGE CXX
@@ -354,4 +354,4 @@ define_gpu_extension_target(
     WITH_SOABI
 )
 
-message(STATUS "Enabling C extension.")
\ No newline at end of file
+message(STATUS "Enabling C extension.")
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index f661084ec48ae..2cf3c1a755d3c 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -92,7 +92,7 @@ if(FLASH_MLA_ARCHS)
         SRCS "${FlashMLA_Extension_SOURCES}"
         CUDA_ARCHS "${FLASH_MLA_ARCHS}")
 
-    define_gpu_extension_target(
+    define_extension_target(
         _flashmla_C
         DESTINATION vllm
         LANGUAGE ${VLLM_GPU_LANG}
@@ -109,7 +109,7 @@ if(FLASH_MLA_ARCHS)
         $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
         $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
 
-    define_gpu_extension_target(
+    define_extension_target(
         _flashmla_extension_C
         DESTINATION vllm
         LANGUAGE ${VLLM_GPU_LANG}
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 931090db50e92..29db9fa273a41 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
+          GIT_TAG 8e1b01d56210dc72030a2d0d41c2d8d266ba6309
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index c2181d4549236..ca0062ba4fabe 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -453,21 +453,20 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
 endmacro()
 
 #
-# Define a target named `GPU_MOD_NAME` for a single extension. The
+# Define a target named `MOD_NAME` for a single extension. The
 # arguments are:
 #
 # DESTINATION <dest>         - Module destination directory.
-# LANGUAGE <lang>            - The GPU language for this module, e.g CUDA, HIP,
-#                              etc.
+# LANGUAGE <lang>            - The language for this module, e.g. CUDA, HIP,
+#                              CXX, etc.
 # SOURCES <sources>          - List of source files relative to CMakeLists.txt
 #                              directory.
 #
 # Optional arguments:
 #
-# ARCHITECTURES <arches>     - A list of target GPU architectures in cmake
-#                              format.
-#                              Refer `CMAKE_CUDA_ARCHITECTURES` documentation
-#                              and `CMAKE_HIP_ARCHITECTURES` for more info.
+# ARCHITECTURES <arches>     - A list of target architectures in cmake format.
+#                              For GPU, refer to CMAKE_CUDA_ARCHITECTURES and
+#                              CMAKE_HIP_ARCHITECTURES for more info.
 #                              ARCHITECTURES will use cmake's defaults if
 #                              not provided.
 # COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
@@ -478,63 +477,61 @@ endmacro()
 #
 # Note: optimization level/debug info is set via cmake build type.
 #
-function (define_gpu_extension_target GPU_MOD_NAME)
+function (define_extension_target MOD_NAME)
   cmake_parse_arguments(PARSE_ARGV 1
-    GPU
+    ARG
     "WITH_SOABI"
     "DESTINATION;LANGUAGE;USE_SABI"
     "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
 
   # Add hipify preprocessing step when building with HIP/ROCm.
-  if (GPU_LANGUAGE STREQUAL "HIP")
-    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
+  if (ARG_LANGUAGE STREQUAL "HIP")
+    hipify_sources_target(ARG_SOURCES ${MOD_NAME} "${ARG_SOURCES}")
   endif()
 
-  if (GPU_WITH_SOABI)
-    set(GPU_WITH_SOABI WITH_SOABI)
+  if (ARG_WITH_SOABI)
+    set(SOABI_KEYWORD WITH_SOABI)
   else()
-    set(GPU_WITH_SOABI)
+    set(SOABI_KEYWORD "")
   endif()
 
-  if (GPU_USE_SABI)
-    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  if (ARG_USE_SABI)
+    Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
   else()
-    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+    Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
   endif()
 
-  if (GPU_LANGUAGE STREQUAL "HIP")
+  if (ARG_LANGUAGE STREQUAL "HIP")
     # Make this target dependent on the hipify preprocessor step.
-    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
+    add_dependencies(${MOD_NAME} hipify${MOD_NAME})
     # Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder
-    target_include_directories(${GPU_MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
-      ${GPU_INCLUDE_DIRECTORIES})
+    target_include_directories(${MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
+      ${ARG_INCLUDE_DIRECTORIES})
   else()
-    target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
-      ${GPU_INCLUDE_DIRECTORIES})
+    target_include_directories(${MOD_NAME} PRIVATE csrc
+      ${ARG_INCLUDE_DIRECTORIES})
   endif()
 
-  if (GPU_ARCHITECTURES)
-    set_target_properties(${GPU_MOD_NAME} PROPERTIES
-      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
+  if (ARG_ARCHITECTURES)
+    set_target_properties(${MOD_NAME} PROPERTIES
+      ${ARG_LANGUAGE}_ARCHITECTURES "${ARG_ARCHITECTURES}")
   endif()
 
+  target_compile_options(${MOD_NAME} PRIVATE
+    $<$<COMPILE_LANGUAGE:${ARG_LANGUAGE}>:${ARG_COMPILE_FLAGS}>)
 
-  target_compile_options(${GPU_MOD_NAME} PRIVATE
-    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
+  target_compile_definitions(${MOD_NAME} PRIVATE
+    "-DTORCH_EXTENSION_NAME=${MOD_NAME}")
 
-  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
-    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
-
-
-  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
+  target_link_libraries(${MOD_NAME} PRIVATE torch ${ARG_LIBRARIES})
 
   # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
   # dependencies that are not necessary and may not be installed.
-  if (GPU_LANGUAGE STREQUAL "CUDA")
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
+  if (ARG_LANGUAGE STREQUAL "CUDA")
+    target_link_libraries(${MOD_NAME} PRIVATE torch CUDA::cudart CUDA::cuda_driver ${ARG_LIBRARIES})
   else()
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
+    target_link_libraries(${MOD_NAME} PRIVATE torch ${TORCH_LIBRARIES} ${ARG_LIBRARIES})
   endif()
 
-  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
+  install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${ARG_DESTINATION} COMPONENT ${MOD_NAME})
 endfunction()
diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu
index 6bee9e4ce1166..229d9862fb670 100644
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -46,6 +46,32 @@ __global__ void merge_attn_states_kernel(
   s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;
 
   const float max_lse = fmaxf(p_lse, s_lse);
+
+  /* In certain edge cases, MLA can produce p_lse = s_lse = -inf;
+     continuing the pipeline then yields NaN. Root cause: with chunked prefill
+     a batch may be split into two chunks; if a request in that batch has no
+     prefix hit, every LSE entry for that request’s position is -inf, and at
+     this moment we merge cross-attention at first. For now we simply emit
+     prefix_output (expected to be all zeros) and prefix_lse (-inf) to fix
+     this problem.
+  */
+  if (std::isinf(max_lse)) {
+    if (pack_offset < head_size) {
+      // Pack 128b load
+      pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
+          prefix_head_ptr)[pack_offset / pack_size];
+
+      // Pack 128b storage
+      reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
+          p_out_pack;
+    }
+    // We only need to write to output_lse once per head.
+    if (output_lse != nullptr && pack_idx == 0) {
+      output_lse[head_idx * num_tokens + token_idx] = max_lse;
+    }
+    return;
+  }
+
   p_lse = p_lse - max_lse;
   s_lse = s_lse - max_lse;
   const float p_se = expf(p_lse);
diff --git a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h
index 13c6178941cf8..7d22dd8b84a39 100644
--- a/csrc/mamba/mamba_ssm/selective_scan.h
+++ b/csrc/mamba/mamba_ssm/selective_scan.h
@@ -24,6 +24,8 @@ struct SSMParamsBase {
     int64_t pad_slot_id;
 
     bool delta_softplus;
+    bool cache_enabled;
+    int block_size;
 
     index_t A_d_stride;
     index_t A_dstate_stride;
@@ -46,8 +48,9 @@ struct SSMParamsBase {
     index_t out_z_batch_stride;
     index_t out_z_d_stride;
     index_t ssm_states_batch_stride;
-    index_t ssm_states_dim_stride;  
+    index_t ssm_states_dim_stride;
     index_t ssm_states_dstate_stride;
+    index_t cache_indices_stride;
 
     // Common data pointers.
     void *__restrict__ A_ptr;
@@ -66,6 +69,9 @@ struct SSMParamsBase {
     void *__restrict__ cache_indices_ptr;
     void *__restrict__ has_initial_state_ptr;
 
+    void *__restrict__ block_idx_first_scheduled_token_ptr;  // (batch,) - first block to write
+    void *__restrict__ block_idx_last_scheduled_token_ptr;   // (batch,) - last block to write
+    void *__restrict__ initial_state_idx_ptr;  // (batch,) - index of the initial state to use
 };
 
 
diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
index d534e138d26d6..fb2a2e5789999 100644
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@@ -119,7 +119,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
 
     const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
         : reinterpret_cast<int *>(params.cache_indices_ptr);
-    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id]; 
     // cache_index == params.pad_slot_id is defined as padding, so we exit early
     if (cache_index == params.pad_slot_id){
         return;
@@ -133,9 +133,18 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
     weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
     input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) + 
-    cache_index * params.ssm_states_batch_stride + 
-    dim_id * kNRows * params.ssm_states_dim_stride;
+
+    typename Ktraits::state_t *ssm_states;
+    if (params.cache_enabled) {
+        // APC mode: ssm_states points to the base, we'll use absolute cache slots later
+        ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
+            dim_id * kNRows * params.ssm_states_dim_stride;
+    } else {
+        // Non-APC mode: offset by cache_index as before
+        ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
+            cache_index * params.ssm_states_batch_stride +
+            dim_id * kNRows * params.ssm_states_dim_stride;
+    }
     
     float D_val[kNRows] = {0};
     if (params.D_ptr != nullptr) {
@@ -159,7 +168,22 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     // }
 
     constexpr int kChunkSize = kNThreads * kNItems;
-    const int n_chunks = (seqlen + 2048 - 1) / 2048;
+
+    // Use block_size for chunking when APC is enabled, otherwise use 2048 for backwards compatibility
+    const int iteration_chunk_size = params.cache_enabled ? params.block_size : 2048;
+    const int n_chunks = (seqlen + iteration_chunk_size - 1) / iteration_chunk_size;
+
+    const int* batch_cache_indices = cache_indices != nullptr ?
+                                     cache_indices + batch_id * params.cache_indices_stride : nullptr;
+    const int* block_idx_first_scheduled = params.block_idx_first_scheduled_token_ptr != nullptr ?
+                                           reinterpret_cast<const int*>(params.block_idx_first_scheduled_token_ptr) : nullptr;
+    const int* block_idx_last_scheduled = params.block_idx_last_scheduled_token_ptr != nullptr ?
+                                          reinterpret_cast<const int*>(params.block_idx_last_scheduled_token_ptr) : nullptr;
+    const int* initial_state_idx = params.initial_state_idx_ptr != nullptr ?
+                                   reinterpret_cast<const int*>(params.initial_state_idx_ptr) : nullptr;
+
+    const size_t load_cache_slot = params.cache_enabled && batch_cache_indices != nullptr ? batch_cache_indices[initial_state_idx[batch_id]] : cache_index;
+
     for (int chunk = 0; chunk < n_chunks; ++chunk) {
         input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];
 
@@ -219,7 +243,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (kIsVariableC) {
                 auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
                 load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
-                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1 ));
+                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1));
                 if constexpr (!kIsVariableB) {
                     #pragma unroll
                     for (int r = 0; r < kNRows; ++r) {
@@ -242,7 +266,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 for (int i = 0; i < kNItems; ++i) {
                     thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
                                                  !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);
-                    
                     if (seqlen % (kNItems * kNThreads) != 0) {  // So that the last state is correct
                         if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) {
                             thread_data[i] = make_float2(1.f, 0.f);
@@ -250,8 +273,24 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                     }
                 }
                 // Initialize running total
-
-                scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx * params.ssm_states_dstate_stride]): 0.0);
+                scan_t running_prefix;
+                if (chunk > 0) {
+                    running_prefix = smem_running_prefix[state_idx + r * MAX_DSTATE];
+                } else {
+                    // Load initial state
+                    if (params.cache_enabled && has_initial_state && batch_cache_indices != nullptr) {
+                        size_t state_offset = load_cache_slot * params.ssm_states_batch_stride +
+                                             r * params.ssm_states_dim_stride +
+                                             state_idx * params.ssm_states_dstate_stride;
+                        running_prefix = make_float2(1.0, float(ssm_states[state_offset]));
+                    } else if (has_initial_state) {
+                        // Non-APC mode: load from current batch position
+                        running_prefix = make_float2(1.0, float(ssm_states[state_idx * params.ssm_states_dstate_stride]));
+                    } else {
+                        // No initial state
+                        running_prefix = make_float2(1.0, 0.0);
+                    }
+                }
 
                 SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
                 typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
@@ -260,8 +299,25 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 // There's a syncthreads in the scan op, so we don't need to sync here.
                 // Unless there's only 1 warp, but then it's the same thread (0) reading and writing.
                 if (threadIdx.x == 0) {
-                    smem_running_prefix[state_idx] = prefix_op.running_prefix;
-                    if (chunk == n_chunks - 1) {
+                    smem_running_prefix[state_idx + r * MAX_DSTATE] = prefix_op.running_prefix;
+
+                    // Store state at the end of each chunk when cache is enabled
+                    if (params.cache_enabled && batch_cache_indices != nullptr) {
+
+                        size_t cache_slot;
+                        if (chunk == n_chunks - 1) {
+                            cache_slot = batch_cache_indices[block_idx_last_scheduled[batch_id]];
+                        } else {
+                            cache_slot = batch_cache_indices[block_idx_first_scheduled[batch_id] + chunk];
+                        }
+
+                        size_t state_offset = cache_slot * params.ssm_states_batch_stride +
+                                             r * params.ssm_states_dim_stride +
+                                             state_idx * params.ssm_states_dstate_stride;
+
+                        ssm_states[state_offset] = typename Ktraits::state_t(prefix_op.running_prefix.y);
+                    } else if (!params.cache_enabled && chunk == n_chunks - 1) {
+                        // Non-APC mode: store only final state at current batch position
                         ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
                     }
                 }
@@ -274,7 +330,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 }
             }
         }
-        
         input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
             + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
         __syncthreads();
@@ -346,7 +401,9 @@ template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
 
     #ifndef USE_ROCM
-        if (params.seqlen <= 128) {           
+        if (params.cache_enabled && params.block_size == 1024) {
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+        } else if (params.seqlen <= 128) {
             selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 256) {
             selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
@@ -358,7 +415,9 @@ void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
             selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
         }
     #else
-        if (params.seqlen <= 256) {
+        if (params.cache_enabled && params.block_size == 1024) {
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+        } else if (params.seqlen <= 256) {
             selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 512) {
             selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
@@ -437,13 +496,17 @@ void set_ssm_params_fwd(SSMParamsBase &params,
                         const std::optional<at::Tensor>& D,
                         const std::optional<at::Tensor>& delta_bias,
                         const torch::Tensor ssm_states,
-                        bool has_z, 
+                        bool has_z,
                         bool delta_softplus,
                         const std::optional<at::Tensor>& query_start_loc,
                         const std::optional<at::Tensor>& cache_indices,
                         const std::optional<at::Tensor>& has_initial_state,
                         bool varlen,
-                        int64_t pad_slot_id) {
+                        int64_t pad_slot_id,
+                        int64_t block_size,
+                        const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
+                        const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
+                        const std::optional<torch::Tensor> &initial_state_idx) {
 
     // Reset the parameters
     memset(&params, 0, sizeof(params));
@@ -477,6 +540,14 @@ void set_ssm_params_fwd(SSMParamsBase &params,
     params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
     params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;
 
+    // Set cache parameters - cache is enabled if we have direct cache writing params
+    params.cache_enabled = block_idx_first_scheduled_token.has_value();
+    params.block_size = static_cast<int>(block_size);
+
+    // Set direct cache writing pointers
+    params.block_idx_first_scheduled_token_ptr = block_idx_first_scheduled_token.has_value() ? block_idx_first_scheduled_token.value().data_ptr() : nullptr;
+    params.block_idx_last_scheduled_token_ptr = block_idx_last_scheduled_token.has_value() ? block_idx_last_scheduled_token.value().data_ptr() : nullptr;
+    params.initial_state_idx_ptr = initial_state_idx.has_value() ? initial_state_idx.value().data_ptr() : nullptr;
 
     // All stride are in elements, not bytes.
     params.A_d_stride = A.stride(0);
@@ -504,9 +575,11 @@ void set_ssm_params_fwd(SSMParamsBase &params,
         params.out_d_stride = out.stride(0);
 
         params.ssm_states_batch_stride = ssm_states.stride(0);
-        params.ssm_states_dim_stride = ssm_states.stride(1);  
+        params.ssm_states_dim_stride = ssm_states.stride(1);
         params.ssm_states_dstate_stride = ssm_states.stride(2);
 
+        params.cache_indices_stride = cache_indices.has_value() ? cache_indices.value().stride(0) : 0;
+
     }
     else{
         if (!is_variable_B) {
@@ -537,8 +610,10 @@ void set_ssm_params_fwd(SSMParamsBase &params,
         params.out_d_stride = out.stride(1);
         
         params.ssm_states_batch_stride = ssm_states.stride(0);
-        params.ssm_states_dim_stride = ssm_states.stride(1);  
+        params.ssm_states_dim_stride = ssm_states.stride(1);
         params.ssm_states_dstate_stride = ssm_states.stride(2);
+
+        params.cache_indices_stride = cache_indices.has_value() ? cache_indices.value().stride(0) : 0;
     }
 }
 
@@ -554,7 +629,11 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                   const torch::Tensor &ssm_states,
                   // used to identify padding entries if cache_indices provided
                   // in case of padding, the kernel will return early
-                  int64_t pad_slot_id) {
+                  int64_t pad_slot_id,
+                  int64_t block_size,
+                  const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
+                  const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
+                  const std::optional<torch::Tensor> &initial_state_idx) {
     auto input_type = u.scalar_type();
     auto weight_type = A.scalar_type();
     TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -646,7 +725,16 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
         auto cache_indices_ = cache_indices.value();
         TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
         TORCH_CHECK(cache_indices_.is_cuda());
-        CHECK_SHAPE(cache_indices_, batch_size);
+
+        // cache_indices can be either 1D (batch_size,) for non-APC mode
+        // or 2D (batch_size, max_positions) for APC mode
+        const bool is_apc_mode = block_idx_first_scheduled_token.has_value();
+        if (is_apc_mode) {
+            TORCH_CHECK(cache_indices_.dim() == 2, "cache_indices must be 2D for APC mode");
+            TORCH_CHECK(cache_indices_.size(0) == batch_size, "cache_indices first dimension must match batch_size");
+        } else {
+            CHECK_SHAPE(cache_indices_, batch_size);
+        }
     }
    
 
@@ -686,7 +774,11 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                        cache_indices,
                        has_initial_state,
                        varlen,
-                       pad_slot_id
+                       pad_slot_id,
+                       block_size,
+                       block_idx_first_scheduled_token,
+                       block_idx_last_scheduled_token,
+                       initial_state_idx
                        );
 
     
diff --git a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
index 1d06fc6b5b0a0..df47bb8dd1d7d 100644
--- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
+++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
@@ -87,30 +87,23 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
   const int64_t g_eff_13 = (group_size != -1) ? group_size : H;
   const int64_t g_eff_2 = (group_size != -1) ? group_size : I;
 
-  // Per-expert outputs filled in parallel
-  std::vector<torch::Tensor> y_list(E);
-  y_list.resize(E);
+  auto X_all = x_c.index_select(/*dim=*/0, expert_tokens);
+  if (apply_router_weight_on_input) {
+    X_all = X_all.mul(expert_gates.unsqueeze(1));
+  }
+  auto Y_all = at::empty({offsets[E], H}, x_c.options());
 
   at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
+    c10::InferenceMode guard;
     for (int64_t e = e_begin; e < e_end; ++e) {
       const int64_t te = counts[e];
       if (te == 0) {
-        y_list[e] = at::empty({0, H}, x_c.options());
         continue;
       }
 
       const int64_t start = offsets[e];
 
-      auto sel_tokens =
-          expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
-      auto gates_e =
-          expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
-
-      auto x_e = x_c.index_select(/*dim=*/0, sel_tokens);
-
-      if (apply_router_weight_on_input) {
-        x_e = x_e.mul(gates_e.unsqueeze(1));
-      }
+      auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
 
       auto w13_e = w13_packed.select(/*dim=*/0, e);
       auto w2_e = w2_packed.select(/*dim=*/0, e);
@@ -137,17 +130,15 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
       // W2
       auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H);
 
-      if (!apply_router_weight_on_input) {
-        y = y.mul(gates_e.unsqueeze(1));
-      }
-
       // Store per-expert result
-      y_list[e] = y;
+      Y_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te).copy_(y);
     }
   });
 
-  // Concatenate all expert outputs to match expert_tokens order
-  auto Y_all = at::cat(y_list, /*dim=*/0);
+  if (!apply_router_weight_on_input) {
+    Y_all = Y_all.mul(expert_gates.unsqueeze(1));
+  }
+
   auto out = at::zeros({T, H}, x.options());
   out =
       at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all);
diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu
index c93f9d54d780c..69b4c1fb11d1a 100644
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -427,11 +427,29 @@ __device__ inline bool is_finite(const T val) {
 #endif
 }
 
+// Scoring function enums
+enum ScoringFunc {
+  SCORING_NONE = 0,    // no activation function
+  SCORING_SIGMOID = 1  // apply sigmoid
+};
+
+// Efficient sigmoid approximation from TensorRT-LLM
+__device__ inline float sigmoid_accurate(float x) {
+  return 0.5f * tanhf(0.5f * x) + 0.5f;
+}
+
 template <typename T>
-__device__ void topk_with_k2(T* output, T const* input,
+__device__ inline T apply_sigmoid(T val) {
+  float f = cuda_cast<float, T>(val);
+  return cuda_cast<T, float>(sigmoid_accurate(f));
+}
+
+template <typename T>
+__device__ void topk_with_k2(T* output, T const* input, T const* bias,
                              cg::thread_block_tile<32> const& tile,
                              int32_t const lane_id,
-                             int const num_experts_per_group) {
+                             int const num_experts_per_group,
+                             int const scoring_func) {
   // Get the top2 per thread
   T largest = neg_inf<T>();
   T second_largest = neg_inf<T>();
@@ -439,6 +457,12 @@ __device__ void topk_with_k2(T* output, T const* input,
   if (num_experts_per_group > WARP_SIZE) {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
       T value = input[i];
+      // Apply scoring function if needed
+      if (scoring_func == SCORING_SIGMOID) {
+        value = apply_sigmoid(value);
+      }
+      value = value + bias[i];
+
       if (value > largest) {
         second_largest = largest;
         largest = value;
@@ -448,7 +472,13 @@ __device__ void topk_with_k2(T* output, T const* input,
     }
   } else {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      largest = input[i];
+      T value = input[i];
+      // Apply scoring function if needed
+      if (scoring_func == SCORING_SIGMOID) {
+        value = apply_sigmoid(value);
+      }
+      value = value + bias[i];
+      largest = value;
     }
   }
 
@@ -472,17 +502,21 @@ __device__ void topk_with_k2(T* output, T const* input,
 }
 
 template <typename T>
-__global__ void topk_with_k2_kernel(T* output, T* input,
+__global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
                                     int64_t const num_tokens,
                                     int64_t const num_cases,
                                     int64_t const n_group,
-                                    int64_t const num_experts_per_group) {
+                                    int64_t const num_experts_per_group,
+                                    int const scoring_func) {
   int32_t warp_id = threadIdx.x / WARP_SIZE;
   int32_t lane_id = threadIdx.x % WARP_SIZE;
 
   int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
   if (case_id < num_cases) {
     input += case_id * num_experts_per_group;
+    // bias is per expert group, offset to current group
+    int32_t group_id = case_id % n_group;
+    T const* group_bias = bias + group_id * num_experts_per_group;
     output += case_id;
 
     cg::thread_block block = cg::this_thread_block();
@@ -491,7 +525,8 @@ __global__ void topk_with_k2_kernel(T* output, T* input,
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     asm volatile("griddepcontrol.wait;");
 #endif
-    topk_with_k2(output, input, tile, lane_id, num_experts_per_group);
+    topk_with_k2(output, input, group_bias, tile, lane_id,
+                 num_experts_per_group, scoring_func);
   }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
   asm volatile("griddepcontrol.launch_dependents;");
@@ -500,16 +535,15 @@ __global__ void topk_with_k2_kernel(T* output, T* input,
 
 template <typename T, typename IdxT>
 __global__ void group_idx_and_topk_idx_kernel(
-    T* scores, T const* group_scores, T* topk_values, IdxT* topk_indices,
-    T* scores_with_bias, int64_t const num_tokens, int64_t const n_group,
+    T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
+    T const* bias, int64_t const num_tokens, int64_t const n_group,
     int64_t const topk_group, int64_t const topk, int64_t const num_experts,
     int64_t const num_experts_per_group, bool renormalize,
-    double routed_scaling_factor) {
+    double routed_scaling_factor, int scoring_func) {
   int32_t warp_id = threadIdx.x / WARP_SIZE;
   int32_t lane_id = threadIdx.x % WARP_SIZE;
   int32_t case_id =
       blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;  // one per token
-  scores_with_bias += case_id * num_experts;
   scores += case_id * num_experts;
   group_scores += case_id * n_group;
   topk_values += case_id * topk;
@@ -577,10 +611,16 @@ __global__ void group_idx_and_topk_idx_kernel(
         int32_t offset = i_group * num_experts_per_group;
         for (int32_t i = lane_id; i < align_num_experts_per_group;
              i += WARP_SIZE) {
-          T candidates = (i < num_experts_per_group) &&
-                                 is_finite(scores_with_bias[offset + i])
-                             ? scores_with_bias[offset + i]
-                             : neg_inf<T>();
+          T candidates = neg_inf<T>();
+          if (i < num_experts_per_group) {
+            // Apply scoring function (if any) and add bias
+            T input = scores[offset + i];
+            if (is_finite(input)) {
+              T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
+                                                          : input;
+              candidates = score + bias[offset + i];
+            }
+          }
           queue.add(candidates, offset + i);
         }
         if (group_scores[i_group] == topk_group_value) {
@@ -602,11 +642,12 @@ __global__ void group_idx_and_topk_idx_kernel(
     for (int i = lane_id;
          i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
          i += WARP_SIZE) {
-      T value =
-          i < topk
-              ? scores[s_topk_idx[i]]
-              : cuda_cast<T, float>(0.0f);  // Load the valid value of expert
+      T value = cuda_cast<T, float>(0.0f);
       if (i < topk) {
+        // Load the score value (without bias) for normalization
+        T input = scores[s_topk_idx[i]];
+        value =
+            (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
         s_topk_value[i] = value;
       }
       topk_sum +=
@@ -627,12 +668,12 @@ __global__ void group_idx_and_topk_idx_kernel(
           value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
         }
         topk_indices[i] = s_topk_idx[i];
-        topk_values[i] = cuda_cast<T, float>(value);
+        topk_values[i] = value;
       }
     } else {
       for (int i = lane_id; i < topk; i += WARP_SIZE) {
         topk_indices[i] = i;
-        topk_values[i] = cuda_cast<T, float>(1.0f / topk);
+        topk_values[i] = 1.0f / topk;
       }
     }
     // Note: when if_proceed_next_topk==false, choose the first 8 experts as the
@@ -644,12 +685,12 @@ __global__ void group_idx_and_topk_idx_kernel(
 }
 
 template <typename T, typename IdxT>
-void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
-                   IdxT* topk_indices, T* scores_with_bias,
-                   int64_t const num_tokens, int64_t const num_experts,
-                   int64_t const n_group, int64_t const topk_group,
-                   int64_t const topk, bool const renormalize,
-                   double const routed_scaling_factor, bool enable_pdl = false,
+void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
+                   IdxT* topk_indices, T const* bias, int64_t const num_tokens,
+                   int64_t const num_experts, int64_t const n_group,
+                   int64_t const topk_group, int64_t const topk,
+                   bool const renormalize, double const routed_scaling_factor,
+                   int const scoring_func, bool enable_pdl = false,
                    cudaStream_t const stream = 0) {
   int64_t num_cases = num_tokens * n_group;
   int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -664,8 +705,9 @@ void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
   attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
   config.numAttrs = 1;
   config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias,
-                     num_tokens, num_cases, n_group, num_experts / n_group);
+  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
+                     num_tokens, num_cases, n_group, num_experts / n_group,
+                     scoring_func);
 
   int64_t topk_with_k_group_num_blocks =
       (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -682,19 +724,18 @@ void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
   config.numAttrs = 1;
   config.attrs = attrs;
   cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
-                     topk_values, topk_indices, scores_with_bias, num_tokens,
-                     n_group, topk_group, topk, num_experts,
-                     num_experts / n_group, renormalize, routed_scaling_factor);
+                     topk_values, topk_indices, bias, num_tokens, n_group,
+                     topk_group, topk, num_experts, num_experts / n_group,
+                     renormalize, routed_scaling_factor, scoring_func);
 }
 
 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                       \
   template void invokeNoAuxTc<T, IdxT>(                                     \
-      T * scores, T * group_scores, T * topk_values, IdxT * topk_indices,   \
-      T * scores_with_bias, int64_t const num_tokens,                       \
-      int64_t const num_experts, int64_t const n_group,                     \
-      int64_t const topk_group, int64_t const topk, bool const renormalize, \
-      double const routed_scaling_factor, bool enable_pdl,                  \
-      cudaStream_t const stream);
+      T * scores, T * group_scores, float* topk_values, IdxT* topk_indices, \
+      T const* bias, int64_t const num_tokens, int64_t const num_experts,   \
+      int64_t const n_group, int64_t const topk_group, int64_t const topk,  \
+      bool const renormalize, double const routed_scaling_factor,           \
+      int const scoring_func, bool enable_pdl, cudaStream_t const stream);
 
 INSTANTIATE_NOAUX_TC(float, int32_t);
 INSTANTIATE_NOAUX_TC(half, int32_t);
@@ -703,28 +744,32 @@ INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t);
 }  // namespace vllm
 
 std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
-    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
-    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
-    double routed_scaling_factor) {
-  auto data_type = scores_with_bias.scalar_type();
-  auto input_size = scores_with_bias.sizes();
+    torch::Tensor const& scores, int64_t n_group, int64_t topk_group,
+    int64_t topk, bool renormalize, double routed_scaling_factor,
+    torch::Tensor const& bias, int64_t scoring_func = 0) {
+  auto data_type = scores.scalar_type();
+  auto input_size = scores.sizes();
   int64_t num_tokens = input_size[0];
   int64_t num_experts = input_size[1];
-  TORCH_CHECK(input_size.size() == 2, "scores_with_bias must be a 2D Tensor");
+  TORCH_CHECK(input_size.size() == 2, "scores must be a 2D Tensor");
   TORCH_CHECK(num_experts % n_group == 0,
               "num_experts should be divisible by n_group");
   TORCH_CHECK(n_group <= 32,
               "n_group should be smaller than or equal to 32 for now");
   TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
+  TORCH_CHECK(scoring_func == vllm::moe::SCORING_NONE ||
+                  scoring_func == vllm::moe::SCORING_SIGMOID,
+              "scoring_func must be SCORING_NONE (0) or SCORING_SIGMOID (1)");
 
   torch::Tensor group_scores = torch::empty(
       {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA));
+  // Always output float32 for topk_values (eliminates Python-side conversion)
   torch::Tensor topk_values = torch::empty(
-      {num_tokens, topk}, torch::dtype(data_type).device(torch::kCUDA));
+      {num_tokens, topk}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
   torch::Tensor topk_indices = torch::empty(
       {num_tokens, topk}, torch::dtype(torch::kInt32).device(torch::kCUDA));
 
-  auto stream = c10::cuda::getCurrentCUDAStream(scores_with_bias.get_device());
+  auto stream = c10::cuda::getCurrentCUDAStream(scores.get_device());
 
   switch (data_type) {
     case torch::kFloat16:
@@ -732,11 +777,11 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
       vllm::moe::invokeNoAuxTc<half, int32_t>(
           reinterpret_cast<half*>(scores.mutable_data_ptr()),
           reinterpret_cast<half*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<half*>(topk_values.mutable_data_ptr()),
+          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
           reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<half*>(scores_with_bias.data_ptr()), num_tokens,
+          reinterpret_cast<half const*>(bias.data_ptr()), num_tokens,
           num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
+          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
       break;
     case torch::kFloat32:
       // Handle Float32
@@ -745,20 +790,20 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
           reinterpret_cast<float*>(group_scores.mutable_data_ptr()),
           reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
           reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<float*>(scores_with_bias.data_ptr()), num_tokens,
+          reinterpret_cast<float const*>(bias.data_ptr()), num_tokens,
           num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
+          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
       break;
     case torch::kBFloat16:
       // Handle BFloat16
       vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>(
           reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()),
           reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16*>(topk_values.mutable_data_ptr()),
+          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
           reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16*>(scores_with_bias.data_ptr()),
-          num_tokens, num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
+          reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), num_tokens,
+          num_experts, n_group, topk_group, topk, renormalize,
+          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
       break;
     default:
       // Handle other data types
diff --git a/csrc/moe/moe_lora_align_sum_kernels.cu b/csrc/moe/moe_lora_align_sum_kernels.cu
index e76d1c3667853..360f1312cf579 100644
--- a/csrc/moe/moe_lora_align_sum_kernels.cu
+++ b/csrc/moe/moe_lora_align_sum_kernels.cu
@@ -28,11 +28,16 @@ __global__ void moe_lora_align_sum_kernel(
     int64_t block_size, int num_experts, int max_loras, size_t numel,
     int max_num_tokens_padded, int max_num_m_blocks,
     int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
-    int topk_num, int32_t* total_tokens_post_pad) {
+    int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
+    int32_t* lora_ids) {
   const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
   const size_t start_idx = threadIdx.x * tokens_per_thread;
 
-  int lora_id = blockIdx.x;
+  int lora_idx = blockIdx.x;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+    return;
+  }
   extern __shared__ int32_t shared_mem[];
   int32_t* cumsum = shared_mem;
   token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
@@ -121,14 +126,13 @@ __global__ void moe_lora_align_sum_kernel(
   }
 }
 
-void moe_lora_align_block_size(torch::Tensor topk_ids,
-                               torch::Tensor token_lora_mapping,
-                               int64_t num_experts, int64_t block_size,
-                               int64_t max_loras, int64_t max_num_tokens_padded,
-                               int64_t max_num_m_blocks,
-                               torch::Tensor sorted_token_ids,
-                               torch::Tensor expert_ids,
-                               torch::Tensor num_tokens_post_pad) {
+void moe_lora_align_block_size(
+    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
+    int64_t num_experts, int64_t block_size, int64_t max_loras,
+    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
+    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids) {
   const int topk_num = topk_ids.size(1);
 
   TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
@@ -164,6 +168,7 @@ void moe_lora_align_block_size(torch::Tensor topk_ids,
             max_loras, topk_ids.numel(), max_num_tokens_padded,
             max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
             expert_ids.data_ptr<int32_t>(), topk_num,
-            num_tokens_post_pad.data_ptr<int32_t>());
+            num_tokens_post_pad.data_ptr<int32_t>(),
+            adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
       });
 }
\ No newline at end of file
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index e4bf0aa99421b..11c6875f7f1d0 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -20,14 +20,13 @@ void batched_moe_align_block_size(int64_t max_tokens_per_batch,
                                   torch::Tensor expert_ids,
                                   torch::Tensor num_tokens_post_pad);
 
-void moe_lora_align_block_size(torch::Tensor topk_ids,
-                               torch::Tensor token_lora_mapping,
-                               int64_t num_experts, int64_t block_size,
-                               int64_t max_loras, int64_t max_num_tokens_padded,
-                               int64_t max_num_m_blocks,
-                               torch::Tensor sorted_token_ids,
-                               torch::Tensor expert_ids,
-                               torch::Tensor num_tokens_post_pad);
+void moe_lora_align_block_size(
+    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
+    int64_t num_experts, int64_t block_size, int64_t max_loras,
+    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
+    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              torch::Tensor b_qweight, torch::Tensor b_scales,
@@ -40,9 +39,9 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              int64_t BLOCK_SIZE_K, int64_t bit);
 
 std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
-    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
-    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
-    double routed_scaling_factor);
+    torch::Tensor const& scores, int64_t n_group, int64_t topk_group,
+    int64_t topk, bool renormalize, double routed_scaling_factor,
+    torch::Tensor const& bias, int64_t scoring_func);
 #endif
 
 bool moe_permute_unpermute_supported();
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index c08a543908ef0..bd95ade40a083 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -44,7 +44,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "                     int max_num_m_blocks, "
       "                     Tensor !sorted_token_ids,"
       "                     Tensor !experts_ids,"
-      "                     Tensor !num_tokens_post_pad) -> () ");
+      "                     Tensor !num_tokens_post_pad,"
+      "                     Tensor !adapter_enabled,"
+      "                     Tensor !lora_ids) -> () ");
   m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
 
 #ifndef USE_ROCM
@@ -105,9 +107,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 
   // Apply grouped topk routing to select experts.
   m.def(
-      "grouped_topk(Tensor scores, Tensor scores_with_bias, int n_group, int "
+      "grouped_topk(Tensor scores, int n_group, int "
       "topk_group, int topk, bool renormalize, float "
-      "routed_scaling_factor) -> (Tensor, Tensor)");
+      "routed_scaling_factor, Tensor bias, int scoring_func) -> (Tensor, "
+      "Tensor)");
   m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
 #endif
 }
diff --git a/csrc/ops.h b/csrc/ops.h
index 0bed7492f6616..3f5cb799b774c 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -321,17 +321,19 @@ void dynamic_per_token_scaled_fp8_quant(
     torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
     std::optional<torch::Tensor> const& scale_ub);
 
-void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
-                        const torch::Tensor& A, const torch::Tensor& B,
-                        const torch::Tensor& C,
-                        const std::optional<torch::Tensor>& D_,
-                        const std::optional<torch::Tensor>& z_,
-                        const std::optional<torch::Tensor>& delta_bias_,
-                        bool delta_softplus,
-                        const std::optional<torch::Tensor>& query_start_loc,
-                        const std::optional<torch::Tensor>& cache_indices,
-                        const std::optional<torch::Tensor>& has_initial_state,
-                        const torch::Tensor& ssm_states, int64_t pad_slot_id);
+void selective_scan_fwd(
+    const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A,
+    const torch::Tensor& B, const torch::Tensor& C,
+    const std::optional<torch::Tensor>& D_,
+    const std::optional<torch::Tensor>& z_,
+    const std::optional<torch::Tensor>& delta_bias_, bool delta_softplus,
+    const std::optional<torch::Tensor>& query_start_loc,
+    const std::optional<torch::Tensor>& cache_indices,
+    const std::optional<torch::Tensor>& has_initial_state,
+    const torch::Tensor& ssm_states, int64_t pad_slot_id, int64_t block_size,
+    const std::optional<torch::Tensor>& block_idx_first_scheduled_token,
+    const std::optional<torch::Tensor>& block_idx_last_scheduled_token,
+    const std::optional<torch::Tensor>& initial_state_idx);
 
 torch::Tensor dynamic_4bit_int_moe_cpu(
     torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu
index 6fcd246f63c50..2521b2797e2c2 100644
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -578,11 +578,13 @@ void persistent_masked_m_silu_mul_quant(
 
   // This kernel currently only supports H % 128 == 0 and assumes a
   // fixed GROUP_SIZE of 128.
+  static constexpr int GROUP_SIZE = 128;
+
   TORCH_CHECK(input.dtype() == torch::kBFloat16);
   TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
               y_q.dtype() == torch::kFloat8_e4m3fnuz);
   TORCH_CHECK(y_s.dtype() == torch::kFloat32);
-  TORCH_CHECK(input.size(-1) % 256 == 0);
+  TORCH_CHECK(input.size(-1) % (GROUP_SIZE * 2) == 0);
 
   using Idx_t = int64_t;
 
@@ -601,8 +603,6 @@ void persistent_masked_m_silu_mul_quant(
 
   Idx_t stride_counts_e = tokens_per_expert.stride(0);
 
-  static constexpr int GROUP_SIZE = 128;
-
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   #define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES)                 \
@@ -628,21 +628,26 @@ void persistent_masked_m_silu_mul_quant(
 
   static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
 
+  int const NUM_GROUPS = H / GROUP_SIZE;
   if (!use_ue8m0) {
-    if (H >= 4096) {
+    if (H >= 4096 && (NUM_GROUPS % 8 == 0)) {
+      /* 8 warps config */
       static constexpr int NUM_STAGES = 4;
       static constexpr int THREAD_COUNT = 256;
       KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES);
     } else {
+      /* 1 warp config */
       static constexpr int THREAD_COUNT = 32;
       KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2);
     }
   } else {
-    if (H >= 4096) {
+    if (H >= 4096 && (NUM_GROUPS % 8 == 0)) {
+      /* 8 warps config */
       static constexpr int NUM_STAGES = 4;
       static constexpr int THREAD_COUNT = 256;
       KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES);
     } else {
+      /* 1 warp config */
       static constexpr int THREAD_COUNT = 32;
       KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2);
     }
diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
index 5575ee8e4197e..6d69852bb4e4f 100644
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -31,6 +31,13 @@
 
 namespace vllm {
 
+template <typename Int>
+__host__ __device__ inline Int round_up(Int x, Int y) {
+  static_assert(std::is_integral_v<Int>,
+                "round_up argument must be integral type");
+  return (x + y - 1) / y * y;
+}
+
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
 __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
@@ -42,10 +49,21 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
+  int sf_m = round_up<int>(numRows, 128);
+  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
+  int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
+  for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
+    // Each thread writes 4 uint32_t elements.
+    for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
+         col += blockDim.x * 4) {
+      SFout[row * sf_n_int + col] = 0x00;
+    }
+  }
+
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
 
   // Input tensor row/col loops.
   for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
@@ -64,7 +82,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
               rowIdx, colIdx, numCols, SFout);
 
       out_pos =
-          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
     }
   }
 }
diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
index cf2cccc913f62..62aeb927ccdcb 100644
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
@@ -1,6 +1,5 @@
 #include "scaled_mm_kernels.hpp"
 #include "scaled_mm_sm100_fp8_dispatch.cuh"
-#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 
 namespace vllm {
 
@@ -13,11 +12,11 @@ void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
   if (bias) {
     TORCH_CHECK(bias->dtype() == out.dtype(),
                 "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm100_fp8_epilogue<c3x::ScaledEpilogueBias>(
-        out, a, b, a_scales, b_scales, *bias);
+    return cutlass_scaled_mm_sm100_fp8_epilogue<true>(out, a, b, a_scales,
+                                                      b_scales, *bias);
   } else {
-    return cutlass_scaled_mm_sm100_fp8_epilogue<c3x::ScaledEpilogue>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_scaled_mm_sm100_fp8_epilogue<false>(out, a, b, a_scales,
+                                                       b_scales);
   }
 }
 
diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
index f876b7d9acd87..c950008b4139a 100644
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -2,6 +2,7 @@
 
 #include "scaled_mm.cuh"
 #include "cutlass_gemm_caller.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 
 /**
  * This file defines Gemm kernel configurations for SM100 (fp8) based on the
@@ -12,8 +13,88 @@ namespace vllm {
 
 using c3x::cutlass_gemm_caller;
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule, bool swap_ab_ = false>
+struct cutlass_3x_gemm_sm100_fp8 {
+  using ElementAB = ElementAB_;
+  using ElementC = ElementD_;
+  using ElementD = ElementD_;
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  static constexpr int AlignmentAB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;
+
+  // Compile-time swap_ab flag
+  static constexpr bool swap_ab = swap_ab_;
+
+  // -----------------------------------------------------------
+  // Layout definitions
+  // -----------------------------------------------------------
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_T = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_T = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+
+  using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
+
+  // -----------------------------------------------------------
+  // Collective epilogue (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, float, ElementC,
+          conditional_t<swap_ab, LayoutC_Transpose, LayoutC>, AlignmentCD,
+          ElementD, conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+          AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // -----------------------------------------------------------
+  // Collective mainloop (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveMainloop = conditional_t<
+      swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutB_T, AlignmentAB,             // Swapped B (as A)
+          ElementAB, LayoutA_T, AlignmentAB,  // Swapped A (as B)
+          ElementAcc, TileShape, ClusterShape, Stages,
+          KernelSchedule>::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutA, AlignmentAB, ElementAB, LayoutB, AlignmentAB, ElementAcc,
+          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp>;
+
+  // -----------------------------------------------------------
+  // Kernel definition
+  // -----------------------------------------------------------
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
 struct sm100_fp8_config_default {
   // M in (256, inf)
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
@@ -22,12 +103,16 @@ struct sm100_fp8_config_default {
   using TileShape = Shape<_256, _128, _128>;
   using ClusterShape = Shape<_2, _2, _1>;
   using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
+template <typename InType, typename OutType, bool EnableBias>
 struct sm100_fp8_config_M256 {
   // M in (64, 256]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
@@ -36,44 +121,127 @@ struct sm100_fp8_config_M256 {
   using TileShape = Shape<_128, _128, _128>;
   using ClusterShape = Shape<_2, _1, _1>;
   using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M64_swap_ab {
+  // This config is for M in (16, 64] and K >= 4096
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _64, _256>;
+  using ClusterShape = Shape<_4, _1, _1>;
+
+  // Use ScaledEpilogueColumnBias instead of ScaledEpilogueBias when doing swap
+  // AB
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                                TileShape, ClusterShape, KernelSchedule,
+                                EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                                ClusterShape, KernelSchedule, EpilogueSchedule,
+                                true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
 struct sm100_fp8_config_M64 {
-  // M in (16, 64]
+  // This config is for M = 64 and K < 4096 (do not enable swap AB in such case)
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
   using TileShape = Shape<_64, _64, _128>;
   using ClusterShape = Shape<_1, _1, _1>;
+
   using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M16 {
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M16_swap_ab {
   // M in [1, 16]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _4, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
+  using TileShape = Shape<_128, _32, _128>;
+  using ClusterShape = Shape<_4, _1, _1>;
+
+  // Use ScaledEpilogueColumnBias instead of ScaledEpilogueBias when doing swap
+  // AB
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                                TileShape, ClusterShape, KernelSchedule,
+                                EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                                ClusterShape, KernelSchedule, EpilogueSchedule,
+                                true>>;
 };
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue,
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                   torch::Tensor const& b,
+                                   EpilogueArgs&&... epilogue_params) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  auto prob_shape =
+      swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+
+  StrideA a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC c_stride = cutlass::make_cute_packed_stride(
+      StrideC{},
+      swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  typename GemmKernel::MainloopArguments mainloop_args =
+      swap_ab ? typename GemmKernel::MainloopArguments{b_ptr, b_stride, a_ptr,
+                                                       a_stride}
+              : typename GemmKernel::MainloopArguments{a_ptr, a_stride, b_ptr,
+                                                       b_stride};
+
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
+
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
+
+template <typename InType, typename OutType, bool EnableBias,
           typename... EpilogueArgs>
 inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
                                             torch::Tensor const& a,
                                             torch::Tensor const& b,
+                                            torch::Tensor const& a_scales,
+                                            torch::Tensor const& b_scales,
                                             EpilogueArgs&&... args) {
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
@@ -81,55 +249,69 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
 
   using Cutlass3xGemmDefault =
       typename sm100_fp8_config_default<InType, OutType,
-                                        Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM16 =
-      typename sm100_fp8_config_M16<InType, OutType, Epilogue>::Cutlass3xGemm;
+                                        EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM16SwapAB =
+      typename sm100_fp8_config_M16_swap_ab<InType, OutType,
+                                            EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM64SwapAB =
+      typename sm100_fp8_config_M64_swap_ab<InType, OutType,
+                                            EnableBias>::Cutlass3xGemm;
   using Cutlass3xGemmM64 =
-      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+      typename sm100_fp8_config_M64<InType, OutType, EnableBias>::Cutlass3xGemm;
+
   using Cutlass3xGemmM256 =
-      typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;
+      typename sm100_fp8_config_M256<InType, OutType,
+                                     EnableBias>::Cutlass3xGemm;
 
   uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
+  uint32_t const k = a.size(1);
 
-  if (mp2 <= 16) {
+  if (m <= 16) {
     // m in [1, 16]
-    return cutlass_gemm_caller<Cutlass3xGemmM16>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 64) {
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM16SwapAB>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m <= 64) {
     // m in (16, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 256) {
+    if (m == 64 && k < 4096) {
+      // do not enable swap AB
+      return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM64>(
+          out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+    }
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM64SwapAB>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+
+  } else if (m <= 256) {
     // m in (64, 256]
-    return cutlass_gemm_caller<Cutlass3xGemmM256>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM256>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
   } else {
     // m in (256, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmDefault>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
   }
 }
 
-template <template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
+template <bool EnableBias, typename... EpilogueArgs>
 void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
                                           torch::Tensor const& a,
                                           torch::Tensor const& b,
+                                          torch::Tensor const& a_scales,
+                                          torch::Tensor const& b_scales,
                                           EpilogueArgs&&... epilogue_args) {
   TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
   TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
 
   if (out.dtype() == torch::kBFloat16) {
     return cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t,
-                                           cutlass::bfloat16_t, Epilogue>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+                                           cutlass::bfloat16_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
   } else {
     TORCH_CHECK(out.dtype() == torch::kFloat16);
     return cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t,
-                                           cutlass::half_t, Epilogue>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+                                           cutlass::half_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
   }
 }
 
diff --git a/csrc/quantization/w8a8/int8/scaled_quant.cu b/csrc/quantization/w8a8/int8/scaled_quant.cu
index 7fe9e96bfb017..be8ecfeacf8c0 100644
--- a/csrc/quantization/w8a8/int8/scaled_quant.cu
+++ b/csrc/quantization/w8a8/int8/scaled_quant.cu
@@ -1,5 +1,6 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
 
 #include <cmath>
 
@@ -275,6 +276,7 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
   int const num_tokens = input.numel() / hidden_size;
   dim3 const grid(num_tokens);
   dim3 const block(std::min(hidden_size, 256));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
@@ -306,6 +308,7 @@ void dynamic_scaled_int8_quant(
   int const num_tokens = input.numel() / hidden_size;
   dim3 const grid(num_tokens);
   dim3 const block(std::min(hidden_size, 256));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 8f091a429fbef..9c0f524dcab11 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -611,7 +611,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "Tensor? cache_indices,"
       "Tensor? has_initial_state,"
       "Tensor! ssm_states,"
-      "int pad_slot_id) -> ()");
+      "int pad_slot_id,"
+      "int block_size,"
+      "Tensor? block_idx_first_scheduled_token,"
+      "Tensor? block_idx_last_scheduled_token,"
+      "Tensor? initial_state_idx) -> ()");
   ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
 
   // Hadamard transforms
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 61ebf970fe960..964700e2a43ac 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -132,9 +132,7 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
-    uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
-    && uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 # cuda arch list used by torch
@@ -356,16 +354,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
-    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
-    uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
-    && uv pip install --system dist/*.whl --verbose \
+    uv pip install --system dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.4.1 \
-    && uv pip install --system flashinfer-jit-cache==0.4.1 \
+    uv pip install --system flashinfer-cubin==0.5.2 \
+    && uv pip install --system flashinfer-jit-cache==0.5.2 \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config
 
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index 6dfa56017838b..b88b9c4992200 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 
 
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.4.1
+# release version: v0.5.2
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.4.1\
+    && git checkout v0.5.2 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index adb0879f20d47..06d229f315bdc 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -75,7 +75,6 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 RUN cd /vllm-workspace \
     && rm -rf vllm \
     && python3 -m pip install -e tests/vllm_test_utils \
-    && python3 -m pip install lm-eval[api]==0.4.4 \
     && python3 -m pip install pytest-shard
 
 # -----------------------
diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x
index 7fd7598b8bd93..2d2068d3453fe 100644
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
@@ -14,7 +14,7 @@ ENV LANG=C.UTF-8 \
 
 # Install development utilities
 RUN microdnf install -y \
-    which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
+    which procps findutils tar vim git gcc-toolset-14 gcc-toolset-14-libatomic-devel patch zlib-devel \
     libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
     openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \
     clang llvm-devel llvm-static clang-devel && \
@@ -85,40 +85,15 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
     rustup default stable && \
     rustup show
 
-FROM python-install AS torch
-ARG TORCH_VERSION=2.7.0
-ENV export _GLIBCXX_USE_CXX11_ABI=1
-ENV CARGO_HOME=/root/.cargo
-ENV RUSTUP_HOME=/root/.rustup
-ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-
-WORKDIR /tmp
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
-    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
-    git clone https://github.com/pytorch/pytorch.git && \
-    cd pytorch && \
-    git checkout v2.7.0 && \
-    git submodule sync && \
-    git submodule update --init --recursive && \
-    uv pip install cmake ninja && \
-    uv pip install -r requirements.txt && \
-    python setup.py bdist_wheel
-    
-
 FROM python-install AS torch-vision
 # Install torchvision
-ARG TORCH_VERSION=2.7.0
-ARG TORCH_VISION_VERSION=v0.20.1
+ARG TORCH_VISION_VERSION=v0.23.0
 WORKDIR /tmp
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
     git clone https://github.com/pytorch/vision.git && \
     cd vision && \
     git checkout $TORCH_VISION_VERSION && \
-    TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
-    uv pip install -v $TORCH_WHL_FILE && \
+    uv pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu && \
     python setup.py bdist_wheel
 
 FROM python-install AS hf-xet-builder
@@ -199,26 +174,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
        sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
     fi && python setup.py bdist_wheel
-
-# Edit aws-lc-sys to support s390x
-FROM python-install AS aws-lc-sys-editor
-WORKDIR /tmp
-ENV CARGO_HOME=/root/.cargo
-ENV RUSTUP_HOME=/root/.rustup
-ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-ARG AWS_LC_VERSION=v0.30.0
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
-    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
-    git clone --recursive https://github.com/aws/aws-lc-rs.git && \
-    cd aws-lc-rs && \
-    git checkout tags/aws-lc-sys/${AWS_LC_VERSION} && \
-    git submodule sync && \
-    git submodule update --init --recursive && \
-    cd aws-lc-sys && \
-    sed -i '682 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
-    sed -i '712 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
-    sed -i '747 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c
     
 # Build Outlines Core
 FROM python-install AS outlines-core-builder
@@ -226,17 +181,17 @@ WORKDIR /tmp
 ENV CARGO_HOME=/root/.cargo
 ENV RUSTUP_HOME=/root/.rustup
 ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-ARG OUTLINES_CORE_VERSION=0.2.10
+COPY requirements/common.txt /tmp/requirements/common.txt
+ARG OUTLINES_CORE_VERSION
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
     --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
-    --mount=type=bind,from=aws-lc-sys-editor,source=/tmp/aws-lc-rs/aws-lc-sys,target=/tmp/aws-lc-sys,rw \
+    OUTLINES_CORE_VERSION=${OUTLINES_CORE_VERSION:-$(grep -E '^outlines_core\s*==\s*[0-9.]+' /tmp/requirements/common.txt | grep -Eo '[0-9.]+')} && \
+    if [ -z "${OUTLINES_CORE_VERSION}" ]; then echo "ERROR: Could not determine outlines_core version"; exit 1; fi && \
     git clone https://github.com/dottxt-ai/outlines-core.git && \
     cd outlines-core && \
     git checkout tags/${OUTLINES_CORE_VERSION} && \
     sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \
-    echo '[patch.crates-io]' >> Cargo.toml && \
-    echo 'aws-lc-sys = { path = "/tmp/aws-lc-sys" }' >> Cargo.toml && \
     uv pip install maturin && \
     python -m maturin build --release --out dist
 
@@ -245,13 +200,15 @@ FROM python-install AS vllm-cpu
 ARG PYTHON_VERSION
 
 # Set correct library path for torch and numactl
-ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:$LD_LIBRARY_PATH"
+ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:/opt/rh/gcc-toolset-14/root/usr/lib64:$LD_LIBRARY_PATH"
 ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH"
 ENV UV_LINK_MODE=copy
 ENV CARGO_HOME=/root/.cargo
 ENV RUSTUP_HOME=/root/.rustup
-ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
 ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
+ENV PCP_DIR=/opt/rh/gcc-toolset-14/root
+ENV PKG_CONFIG_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:/usr/local/lib/pkgconfig/"
+ENV PATH="${VIRTUAL_ENV:+${VIRTUAL_ENV}/bin}:/opt/rh/gcc-toolset-14/root/usr/bin:/usr/local/bin:$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
 
 COPY . /workspace/vllm
 WORKDIR /workspace/vllm
@@ -266,7 +223,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
     --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
     --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
-    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
     --mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
     --mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
     --mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \
@@ -274,7 +230,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
      ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
      VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
      HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl) && \
-     TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl) && \
      LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
      NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
      OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \
@@ -282,7 +237,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         $ARROW_WHL_FILE  \
         $VISION_WHL_FILE \
         $HF_XET_WHL_FILE \
-        $TORCH_WHL_FILE \
         $LLVM_WHL_FILE \
         $NUMBA_WHL_FILE \
         $OUTLINES_CORE_WHL_FILE \
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index 49ea39cad5128..4e6ef8f5ca13c 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -54,7 +54,7 @@ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
-    python3 setup.py install
+    pip install --no-build-isolation .
 
 CMD ["/bin/bash"]
 
@@ -64,9 +64,6 @@ FROM vllm-base AS vllm-openai
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip uninstall oneccl oneccl-devel -y
-
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 
@@ -74,4 +71,7 @@ RUN python3 -m pip install -e tests/vllm_test_utils
 RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip uninstall oneccl oneccl-devel -y
+
 ENTRYPOINT ["vllm", "serve"]
diff --git a/docs/README.md b/docs/README.md
index ae95717def4cd..0608794e7e650 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -56,7 +56,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support
 
diff --git a/docs/assets/design/debug_vllm_compile/design_diagram.png b/docs/assets/design/debug_vllm_compile/design_diagram.png
new file mode 100644
index 0000000000000..9dfd45ec1a315
Binary files /dev/null and b/docs/assets/design/debug_vllm_compile/design_diagram.png differ
diff --git a/docs/assets/design/debug_vllm_compile/dynamic_shapes.png b/docs/assets/design/debug_vllm_compile/dynamic_shapes.png
new file mode 100644
index 0000000000000..7a018cc79c664
Binary files /dev/null and b/docs/assets/design/debug_vllm_compile/dynamic_shapes.png differ
diff --git a/docs/assets/design/debug_vllm_compile/tlparse_inductor.png b/docs/assets/design/debug_vllm_compile/tlparse_inductor.png
new file mode 100644
index 0000000000000..cbef753e6dd69
Binary files /dev/null and b/docs/assets/design/debug_vllm_compile/tlparse_inductor.png differ
diff --git a/docs/community/meetups.md b/docs/community/meetups.md
index 0dfc582c7f8a7..7ddd45799789c 100644
--- a/docs/community/meetups.md
+++ b/docs/community/meetups.md
@@ -2,6 +2,7 @@
 
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
 
+- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link)
 - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6)
 - [vLLM Toronto Meetup](https://luma.com/e80e0ymm), September 25th 2025. [[Slides]](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing)
 - [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index fed286f4b6343..7941b1f49ee8b 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -39,7 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline
 
 ```bash
 VLLM_TORCH_PROFILER_DIR=./vllm_profile \
-    vllm serve meta-llama/Meta-Llama-3-70B
+    vllm serve meta-llama/Llama-3.1-8B-Instruct
 ```
 
 vllm bench command:
@@ -47,7 +47,7 @@ vllm bench command:
 ```bash
 vllm bench serve \
     --backend vllm \
-    --model meta-llama/Meta-Llama-3-70B \
+    --model meta-llama/Llama-3.1-8B-Instruct \
     --dataset-name sharegpt \
     --dataset-path sharegpt.json \
     --profile \
@@ -70,18 +70,21 @@ apt update
 apt install nsight-systems-cli
 ```
 
-### Example commands and usage
+!!! tip
+    When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues).
 
-When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues).
+The Nsight Systems profiler can be launched with `nsys profile ...`, with a few recommended flags for vLLM: `--trace-fork-before-exec=true --cuda-graph-trace=node`.
+
+### Example commands and usage
 
 #### Offline Inference
 
-For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
+For basic usage, you can just append the profiling command before any existing script you would run for offline inference.
 
 The following is an example using the `vllm bench latency` script:
 
 ```bash
-nsys profile -o report.nsys-rep \
+nsys profile  \
     --trace-fork-before-exec=true \
     --cuda-graph-trace=node \
 vllm bench latency \
@@ -95,40 +98,29 @@ vllm bench latency \
 
 #### OpenAI Server
 
-To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference, however you must specify `--delay XX --duration YY` parameters according to the needs of your benchmark. After the duration time has been used up, the server will be killed.
+To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference, but you will need to specify a few other arguments to enable dynamic capture similarly to the Torch Profiler:
 
 ```bash
 # server
-nsys profile -o report.nsys-rep \
+VLLM_TORCH_CUDA_PROFILE=1 \
+nsys profile \
     --trace-fork-before-exec=true \
     --cuda-graph-trace=node \
-    --delay 30 \
-    --duration 60 \
+    --capture-range=cudaProfilerApi \
+    --capture-range-end repeat \
     vllm serve meta-llama/Llama-3.1-8B-Instruct
 
 # client
 vllm bench serve \
     --backend vllm \
     --model meta-llama/Llama-3.1-8B-Instruct \
-    --num-prompts 1 \
-    --dataset-name random \
-    --random-input 1024 \
-    --random-output 512
+    --dataset-name sharegpt \
+    --dataset-path sharegpt.json \
+    --profile \
+    --num-prompts 2
 ```
 
-In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run:
-
-```bash
-nsys sessions list
-```
-
-to get the session id in the form of `profile-XXXXX`, then run:
-
-```bash
-nsys stop --session=profile-XXXXX
-```
-
-to manually kill the profiler and generate your `nsys-rep` report.
+With `--profile`, vLLM will capture a profile for each run of `vllm bench serve`. Once the server is killed, the profiles will all be saved.
 
 #### Analysis
 
diff --git a/docs/deployment/frameworks/helm.md b/docs/deployment/frameworks/helm.md
index e5d44945ba725..1d9e3632593ad 100644
--- a/docs/deployment/frameworks/helm.md
+++ b/docs/deployment/frameworks/helm.md
@@ -13,7 +13,7 @@ Before you begin, ensure that you have the following:
 - A running Kubernetes cluster
 - NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
 - Available GPU resources in your cluster
-- An S3 with the model which will be deployed
+- (Optional) An S3 bucket or other storage with the model weights, if using automatic model download
 
 ## Installing the chart
 
@@ -61,10 +61,16 @@ The following table describes configurable parameters of the chart in `values.ya
 | deploymentStrategy | object | {} | Deployment strategy configuration |
 | externalConfigs | list | [] | External configuration |
 | extraContainers | list | [] | Additional containers configuration |
-| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container |
-| extraInit.pvcStorage | string | "1Gi" | Storage size of the s3 |
-| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files |
-| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service |
+| extraInit | object | {"modelDownload":{"enabled":true},"initContainers":[],"pvcStorage":"1Gi"} | Additional configuration for init containers |
+| extraInit.modelDownload | object | {"enabled":true} | Model download functionality configuration |
+| extraInit.modelDownload.enabled | bool | true | Enable automatic model download job and wait container |
+| extraInit.modelDownload.image | object | {"repository":"amazon/aws-cli","tag":"2.6.4","pullPolicy":"IfNotPresent"} | Image for model download operations |
+| extraInit.modelDownload.waitContainer | object | {} | Wait container configuration (command, args, env) |
+| extraInit.modelDownload.downloadJob | object | {} | Download job configuration (command, args, env) |
+| extraInit.initContainers | list | [] | Custom init containers (appended after model download if enabled) |
+| extraInit.pvcStorage | string | "1Gi" | Storage size for the PVC |
+| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | (Optional) Path of the model on S3 |
+| extraInit.awsEc2MetadataDisabled | bool | true | (Optional) Disable AWS EC2 metadata service |
 | extraPorts | list | [] | Additional ports configuration |
 | gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used |
 | image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration |
@@ -98,3 +104,36 @@ The following table describes configurable parameters of the chart in `values.ya
 | serviceName | string | "" | Service name |
 | servicePort | int | 80 | Service port |
 | labels.environment | string | test | Environment name |
+
+## Configuration Examples
+
+### Using S3 Model Download (Default)
+
+```yaml
+extraInit:
+  modelDownload:
+    enabled: true
+  pvcStorage: "10Gi"
+  s3modelpath: "models/llama-7b"
+```
+
+### Using Custom Init Containers Only
+
+For use cases like llm-d where you need custom sidecars without model download:
+
+```yaml
+extraInit:
+  modelDownload:
+    enabled: false
+  initContainers:
+    - name: llm-d-routing-proxy
+      image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+      imagePullPolicy: IfNotPresent
+      ports:
+        - containerPort: 8080
+          name: proxy
+      securityContext:
+        runAsUser: 1000
+      restartPolicy: Always
+  pvcStorage: "10Gi"
+```
diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md
new file mode 100644
index 0000000000000..3b454e851b54e
--- /dev/null
+++ b/docs/design/debug_vllm_compile.md
@@ -0,0 +1,239 @@
+# How to debug the vLLM-torch.compile integration
+
+TL;DR:
+
+- use tlparse to acquire torch.compile logs. Include these logs in bug reports and/or support asks.
+- The vLLM-torch.compile integration is multiple pieces. vLLM exposes flags to turn off each piece:
+
+| Online Flag | Offline Flag   |      Result |
+|----------|----------|-------------|
+| --enforce-eager | enforce_eager=True |  Turn off torch.compile and CUDAGraphs |
+| -O.mode=0 | mode=CompilationMode.NONE |  Turn off torch.compile only |
+| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(mode=CompilationMode.NONE) |  Turn off CUDAGraphs only |
+| -O.backend=eager | compilation_config=CompilationConfig(backend='eager') |  Turn off TorchInductor |
+
+## vLLM-torch.compile overview
+
+To improve performance, vLLM leverages torch.compile and CUDAGraphs to speed things up.
+torch.compile generates optimized kernels for PyTorch code while CUDAGraphs eliminates overhead.
+Most notably, vLLM-compile is NOT torch.compile, it is a custom compiler built using internal PyTorch Compile APIs.
+
+![vLLM-compile diagram](../assets/design/debug_vllm_compile/design_diagram.png)
+
+- Given a model, we do a full graph capture via TorchDynamo that is dynamic on the batch size (number of tokens)
+- vLLM then optionally splits and/or specializes this graph and then uses TorchInductor to compile each graph into a compiled artifact.
+This step may use vLLM custom Inductor passes to further optimize the graph.
+- The compiled artifact is saved to vLLM's compile cache so that it can be loaded in the future.
+- vLLM applies CUDAGraphs to reduce CPU overheads.
+
+Things can go wrong in each of the four steps. When something does go wrong, please try to isolate the subsystem
+that went wrong -- this will allow you to turn off the minimal number of things to keep reliability
+goals while minimizing impact to performance and also helps us (vLLM) when you open a bug report.
+
+For more details on the design, please see the following resources:
+
+- [Introduction to vLLM-torch.compile blogpost](https://blog.vllm.ai/2025/08/20/torch-compile.html)
+- [vLLM-torch.compile integration design](https://docs.vllm.ai/en/latest/design/torch_compile.html)
+- [vLLM Office Hours #26](https://www.youtube.com/live/xLyxc7hxCJc?si=Xulo9pe53C6ywf0V&t=561)
+- [Talk at PyTorch Conference 2025](https://youtu.be/1wV1ESbGrVQ?si=s1GqymUfwiwOrDTg&t=725)
+
+## Use tlparse
+
+Use [tlparse](https://github.com/meta-pytorch/tlparse) to acquire torch.compile logs. These logs show all stages of the compilation process,
+including the fused kernels that torch.compile produces.
+If you can, we recommend sending these or pieces of these along with any bug reports --
+they are very helpful.
+
+Install tlparse:
+
+```sh
+pip install tlparse
+```
+
+Usage (offline inference)
+
+```sh
+TORCH_TRACE=~/trace_dir python my_script.py
+tlparse ~/trace_dir/<the_first_log_file>
+```
+
+Usage (serving)
+
+```sh
+TORCH_TRACE=~/trace_dir vllm serve
+# ctrl-c out of the server
+tlparse ~/trace_dir/<the_first_log_file>
+```
+
+The `tlparse` command outputs some HTML files (perhaps into e.g. `./tl_out/index.html`).
+Open it to see the logs. It'll look something like the following:
+
+![tlparse example](../assets/design/debug_vllm_compile/tlparse_inductor.png)
+
+## Turn off vLLM-torch.compile integration
+
+Pass `--enforce-eager` to turn off the vLLM-torch.compile integration and run entirely
+in eager mode. This includes turning off CUDAGraphs.
+
+```sh
+# Online
+vllm serve --enforce-eager
+```
+
+```py
+# Offline
+LLM(model, enforce_eager=True)
+```
+
+To turn off just torch.compile, pass `mode = NONE` to the compilation config.
+(`-O` is short for `--compilation_config`):
+
+```sh
+# Online
+vllm serve -O.mode=0
+```
+
+```py
+# Offline
+from vllm.config.compilation import CompilationConfig, CompilationMode
+LLM(model, compilation_config=CompilationConfig(mode=CompilationMode.NONE))
+```
+
+To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:
+
+```sh
+# Online
+vllm serve -O.cudagraph_mode=NONE
+```
+
+```py
+# Offline
+from vllm.config.compilation import CompilationConfig, CUDAGraphMode
+LLM(model, compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE))
+```
+
+## Debugging TorchDynamo
+
+vLLM requires model code be capturable into a full graph via TorchDynamo (torch.compile's frontend).
+TorchDynamo does not support all of Python. It will error (in fullgraph mode) if it cannot support
+a feature (this is sometimes known as a graph break).
+
+If you encounter a graph break, please [open an issue to pytorch/pytorch](https://github.com/pytorch/pytorch) so the PyTorch devs can prioritize.
+Then, try your best to rewrite the code to avoid the graph break.
+For more information, see this [Dynamo guide](https://docs.pytorch.org/docs/stable/compile/programming_model.dynamo_core_concepts.html).
+
+## Debugging Dynamic Shape full graph capture
+
+vLLM requires that the model's forward pass be capturable into a full graph that is dynamic
+on the batch size (i.e. the number of tokens). It (by default) compiles this one graph into
+one artifact and uses this artifact for all batch sizes.
+
+If your code cannot be captured with Dynamic Shapes, you may see silent incorrectness,
+loud errors, or CUDA illegal memory accesses. For example, the following is not
+capturable into a single graph:
+
+```py
+if data.size[0] % 128 == 0:
+    foo(...)
+else:
+    bar(...)
+```
+
+This problem is easy to diagnose. Use tlparse and click on `compilation_metrics`:
+it will tell you symbolic constraints on the batch size. If there is any constraint
+that restricts the batch sizes, then we've got a problem.
+
+![Bad tlparse example](../assets/design/debug_vllm_compile/dynamic_shapes.png)
+
+To avoid this, please either:
+
+1. avoid branching on the number of tokens
+2. wrap the branching logic into a custom operator. TorchDynamo does not
+trace into custom operators.
+
+## Debugging TorchInductor
+
+TorchInductor takes a captured graph and then compiles it down to some Python code
+that may call 1+ triton kernels. On rare (but unfortunate) occasions, it may
+produce an incorrect triton kernel. This may manifest as silent incorrectness,
+CUDA illegal memory accesses, or loud errors.
+
+To debug if TorchInductor is at fault, you can disable it by passing `backend='eager'`
+to the compilation config:
+
+```sh
+# online
+vllm serve -O.backend=eager
+```
+
+```py
+# offline
+LLM(compilation_config=CompilationConfig(backend='eager'))
+```
+
+If Inductor is at fault, [file a bug to PyTorch](https://github.com/pytorch/pytorch).
+If you're feeling adventurous, you can debug the triton kernels in the Inductor output code
+(that you can locate via using tlparse).
+
+![tlparse example](../assets/design/debug_vllm_compile/tlparse_inductor.png)
+
+You can also use `TORCH_LOGS=output_code <command>` to print the Inductor output code.
+
+### Editable TorchInductor code
+
+You can edit the TorchInductor code that gets run by setting `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`
+or passing `-O.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable.
+
+This is a useful technique: you can put breakpoints (e.g. `torch.distributed.breakpoint()`)
+and print statements in the output code.
+
+## Debugging vLLM-compile cache
+
+vLLM built its own cache for torch.compile artifacts. The idea is that the artifacts
+can be compiled once and then reused after they have been compiled. This
+is a layer on top of [torch.compile's compiler cache](https://docs.pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html).
+
+While torch.compile's compiler cache is rock-stable, vLLM's compiler cache is unfortunately
+not always correct. You can disable it via setting `VLLM_DISABLE_COMPILE_CACHE=1`.
+
+You can also manually remove this cache.
+
+- Remove vLLM's compile cache with `rm -rf ~/.cache/vllm` (look at logs to see if the location changed)
+- Remove torch.compile's built-in caches with `rm -rf /tmp/torchinductor_$(whoami)`
+
+vLLM's cache is a mapping from cache key to a compiled artifact. vLLM computes
+the cache key via combining multiple factors (e.g. config flags and model name).
+If vLLM's compile cache is wrong, this usually means that a factor is missing.
+Please see [this example](https://github.com/vllm-project/vllm/blob/18b39828d90413d05d770dfd2e2f48304f4ca0eb/vllm/config/model.py#L310)
+of how vLLM computes part of the cache key.
+
+## Debugging CUDAGraphs
+
+CUDAGraphs is a feature that allows one to:
+
+- Capture a callable that launches 1+ CUDA kernels into a CUDAGraph
+- Replay the CUDAGraph
+
+The captured CUDAGraph contains all of the memory used during the capture process.
+The replay of the CUDAGraph reads and writes to exactly the same regions of memory.
+
+This leads to some restrictions:
+
+1. In order to use CUDAGraphs on new data, you'll need to copy the data into a buffer
+that the CUDAGraph is reading from
+2. CUDAGraphs only capture CUDA kernels, they don't capture work done on CPU.
+
+vLLM uses the raw CUDAGraphs API, which is unsafe when used incorrectly.
+
+To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:
+
+```sh
+# Online
+vllm serve -O.cudagraph_mode=NONE
+```
+
+```py
+# Offline
+from vllm.config.compilation import CompilationConfig, CUDAGraphMode
+LLM(model, compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE))
+```
diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md
index da61d2a85e466..acf7fc245462c 100644
--- a/docs/design/logits_processors.md
+++ b/docs/design/logits_processors.md
@@ -254,7 +254,15 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
                 changes to the batch makeup.
             """
             raise NotImplementedError
-            
+
+        @classmethod
+        def validate_params(cls, sampling_params: SamplingParams):
+            """Validate sampling params for this logits processor.
+
+            Raise ValueError for invalid ones.
+            """
+            return None
+
     ```
 
 A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) the following methods:
@@ -279,6 +287,10 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
     * Use the `BatchUpdate` members to update logits processor internal state
     * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
 
+* `validate_params(cls, sampling_params: SamplingParams)`:
+    * Raise `ValueError` if `SamplingParams` has invalid arguments (especially custom arguments) used by logits processor.
+    * When request is sent to entrypoint, `validate_params()` will validate `SamplingParams` and refuse request with invalid arguments.
+
 ### `BatchUpdate` data structure
 
 The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (note that the order in which the operations are mentioned below reflects the order in which they should be processed in `update_state()`):
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 633e23eea33e2..ee224e6922fbd 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -97,7 +97,7 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels
 | trtllm                       | standard              | mxfp4,</br>nvfp4 | G(16),G(32)   | <sup>5</sup>                                                | N                     | Y       | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts]                                                                                                                                                                                                                      |
 | pallas                       | standard              | N/A              | N/A           | silu                                                        | N                     | N       | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe]                                                                                                                                                                                                                                    |
 | iterative                    | standard              | N/A              | N/A           | silu                                                        | N                     | N       | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe]                                                                                                                                                                                                                           |
-| rocm aiter moe               | standard              | fp8              | G(128),A,T    | silu, gelu                                                  | Y                     | N       | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_moe_impl]                                                                                                                                                                                           |
+| rocm aiter moe               | standard              | fp8              | G(128),A,T    | silu, gelu                                                  | Y                     | N       | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts]                                                                                                                                                                                           |
 | cpu_fused_moe                | standard              | N/A              | N/A           | silu                                                        | N                     | N       | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE]                                                                                                                                                                                                                             |
 | naive batched<sup>4</sup>    | batched               | int8,</br>fp8    | G,A,T         | silu, gelu                                                  | <sup>6</sup>          | Y       | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts]                                                                                                                                                                                                         |
 
diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md
index 5a3ca2de82194..27edc4f89201d 100644
--- a/docs/design/torch_compile.md
+++ b/docs/design/torch_compile.md
@@ -27,6 +27,8 @@ With all these factors taken into consideration, usually we can guarantee that t
 
 A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all the compilation finishes before we serve any requests. No requests will trigger new compilations. Otherwise, the engine would be blocked on that request, and the response time will have unexpected spikes.
 
+By default, the cache saves compiled artifacts as binary files. If you would like to interact with the generated code for debugging purposes, set the field `compile_cache_save_format=unpacked` in the compilation config, or omit this and set the env variable `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`.
+
 ## Python Code Compilation
 
 In the very verbose logs, we can see:
diff --git a/docs/features/custom_arguments.md b/docs/features/custom_arguments.md
index 74ed40835b4d4..7a650d0e79c23 100644
--- a/docs/features/custom_arguments.md
+++ b/docs/features/custom_arguments.md
@@ -4,6 +4,9 @@ You can use vLLM *custom arguments* to pass in arguments which are not part of t
 
 Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code.
 
+!!! note
+    Make sure your custom logits processor have implemented `validate_params` for custom arguments. Otherwise invalid custom arguments can cause unexpected behaviour.
+
 ## Offline Custom Arguments
 
 Custom arguments passed to `SamplingParams.extra_args` as a `dict` will be visible to any code which has access to `SamplingParams`:
diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md
index b8ad53863cd7a..52fcc44efacc5 100644
--- a/docs/features/custom_logitsprocs.md
+++ b/docs/features/custom_logitsprocs.md
@@ -18,6 +18,11 @@ In vLLM, logits processors operate at batch granularity. During a given engine s
 
 Custom logits processors must subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and define (at minimum) the following methods:
 
+* `validate_params(cls, sampling_params: SamplingParams)`:
+    * Raise `ValueError` if `SamplingParams` has invalid arguments (especially custom arguments) used by logits processor.
+    * When request is sent to entrypoint, `validate_params()` will validate `SamplingParams` and refuse request with invalid arguments.
+    * **Note:** it's important to implement `validate_params()` to prevent invalid parameters for custom logits processor. Otherwise requests with invalid parameters can cause unexpected behaviour in custom logits processor.
+
 * `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)`
     * `vllm_config`: engine configuration data structure
     * `device`: hardware accelerator device info
@@ -103,6 +108,14 @@ The contrived example below implements a custom logits processor which consumes
     class DummyLogitsProcessor(LogitsProcessor):
         """Fake logit processor to support unit testing and examples"""
 
+        @classmethod
+        def validate_params(cls, params: SamplingParams):
+            target_token: int | None = params.extra_args and params.extra_args.get(
+                "target_token"
+            )
+            if target_token is not None and not isinstance(target_token, int):
+                raise ValueError(f"target_token value {target_token} is not int")
+
         def __init__(self, vllm_config: "VllmConfig", device: torch.device,
                     is_pin_memory: bool):
             self.req_info: dict[int, int] = {}
@@ -118,6 +131,7 @@ The contrived example below implements a custom logits processor which consumes
             # Process added requests.
             for index, params, _, _ in batch_update.added:
                 assert params is not None
+                self.validate_params(params)
                 if params.extra_args and (target_token :=
                                         params.extra_args.get("target_token")):
                     self.req_info[index] = target_token
@@ -157,6 +171,7 @@ The contrived example below implements a custom logits processor which consumes
             logits[rows, cols] = values_to_keep
 
             return logits
+
     ```
 
 In the rest of this document, we will use `DummyLogitsProcessor` as an example of a custom logits processor.
@@ -180,7 +195,13 @@ RequestLogitsProcessor = Union[
 
 While request-level logits processors are explicitly *not* supported in the vLLM engine, vLLM *does* provide a convenient process to wrap an existing `Callable` request-level logits processor and create a batch-level logits processor that is compatible with vLLM. The `Callable` must conform to the type annotation above; if your request-level logits processor has a different interface, then in order to wrap it, you may need to modify it or implement an additional wrapper layer to comply with the interface specification above.
 
-You can wrap the request-level logits processor by subclassing `AdapterLogitsProcessor` as shown in the example below (in this example, `DummyPerReqLogitsProcessor` is a stand-in for your request-level logits processor which needs to be wrapped.) Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may impact which token has the highest-value logit. Override `AdapterLogitsProcessor.new_req_logits_processor(self,params)` to create a new request-level logits processor instance from a `SamplingParams` instance:
+You can wrap the request-level logits processor by subclassing `AdapterLogitsProcessor` as shown in the example below (in this example, `DummyPerReqLogitsProcessor` is a stand-in for your request-level logits processor which needs to be wrapped.):
+
+* Override `AdapterLogitsProcessor.validate_params(cls,params)` to validate request's sampling parameters.
+
+* Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may impact which token has the highest-value logit.
+
+* Override `AdapterLogitsProcessor.new_req_logits_processor(self,params)` to create a new request-level logits processor instance from a `SamplingParams` instance:
 
 ??? code "Example of Wrapping a Request-Level Logits Processor"
 
@@ -220,6 +241,16 @@ You can wrap the request-level logits processor by subclassing `AdapterLogitsPro
         """Example of wrapping a fake request-level logit processor to create a
         batch-level logits processor"""
 
+        @classmethod
+        def validate_params(cls, params: SamplingParams):
+            target_token: Any | None = params.extra_args and params.extra_args.get(
+                "target_token"
+            )
+            if target_token is not None and not isinstance(target_token, int):
+                raise ValueError(
+                    f"target_token value {target_token} is not int"
+                )
+
         def is_argmax_invariant(self) -> bool:
             return False
 
@@ -240,18 +271,11 @@ You can wrap the request-level logits processor by subclassing `AdapterLogitsPro
             Returns:
             `Callable` request logits processor, or None
             """
-            target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+            target_token: Any | None = params.extra_args and params.extra_args.get(
                 "target_token"
             )
             if target_token is None:
                 return None
-            if not isinstance(target_token, int):
-                logger.warning(
-                    "target_token value %s is not int; not applying logits"
-                    " processor to request.",
-                    target_token,
-                )
-                return None
             return DummyPerReqLogitsProcessor(target_token)
     ```
 
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index caf458c24497c..cde2ec165712b 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -509,8 +509,8 @@ Then, you can use the OpenAI client as follows:
     print("Chat completion output:", chat_response.choices[0].message.content)
 
     # Multi-image input inference
-    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
-    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
+    image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
+    image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
 
     chat_response = client.chat.completions.create(
         model="microsoft/Phi-3.5-vision-instruct",
diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index dc2b2315182a9..5f26c7cf182b9 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -2,7 +2,10 @@
 
 vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
 
-Reasoning models return an additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
+Reasoning models return an additional `reasoning` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
+
+!!! warning
+    `reasoning` used to be called `reasoning_content`. For now, `reasoning_content` will continue to work. However, we encourage you to migrate to `reasoning` in case `reasoning_content` is removed in future.
 
 ## Supported Models
 
@@ -61,18 +64,18 @@ Next, make a request to the model that should return the reasoning content in th
     # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
     response = client.chat.completions.create(model=model, messages=messages)
 
-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content
 
-    print("reasoning_content:", reasoning_content)
+    print("reasoning:", reasoning)
     print("content:", content)
     ```
 
-The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
+The `reasoning` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
 
 ## Streaming chat completions
 
-Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
+Streaming chat completions are also supported for reasoning models. The `reasoning` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
 
 ??? console "Json"
 
@@ -88,7 +91,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
                 "index": 0,
                 "delta": {
                     "role": "assistant",
-                    "reasoning_content": "is",
+                    "reasoning": "is",
                 },
                 "logprobs": null,
                 "finish_reason": null
@@ -97,7 +100,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
     }
     ```
 
-OpenAI Python client library does not officially support `reasoning_content` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning_content` attribute is present in the response. For example:
+OpenAI Python client library does not officially support `reasoning` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning` attribute is present in the response. For example:
 
 ??? code
 
@@ -127,22 +130,22 @@ OpenAI Python client library does not officially support `reasoning_content` att
     )
 
     print("client: Start streaming chat completions...")
-    printed_reasoning_content = False
+    printed_reasoning = False
     printed_content = False
 
     for chunk in stream:
-        # Safely extract reasoning_content and content from delta,
+        # Safely extract reasoning and content from delta,
         # defaulting to None if attributes don't exist or are empty strings
-        reasoning_content = (
-            getattr(chunk.choices[0].delta, "reasoning_content", None) or None
+        reasoning = (
+            getattr(chunk.choices[0].delta, "reasoning", None) or None
         )
         content = getattr(chunk.choices[0].delta, "content", None) or None
 
-        if reasoning_content is not None:
-            if not printed_reasoning_content:
-                printed_reasoning_content = True
-                print("reasoning_content:", end="", flush=True)
-            print(reasoning_content, end="", flush=True)
+        if reasoning is not None:
+            if not printed_reasoning:
+                printed_reasoning = True
+                print("reasoning:", end="", flush=True)
+            print(reasoning, end="", flush=True)
         elif content is not None:
             if not printed_content:
                 printed_content = True
@@ -151,11 +154,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
             print(content, end="", flush=True)
     ```
 
-Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
+Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
 
 ## Tool Calling
 
-The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
+The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning`.
 
 ??? code
 
@@ -192,7 +195,7 @@ The reasoning content is also available when both tool calling and the reasoning
     print(response)
     tool_call = response.choices[0].message.tool_calls[0].function
 
-    print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
+    print(f"reasoning: {response.choices[0].message.reasoning}")
     print(f"Function called: {tool_call.name}")
     print(f"Arguments: {tool_call.arguments}")
     ```
@@ -219,12 +222,11 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso
     # define a reasoning parser and register it to vllm
     # the name list in register_module can be used
     # in --reasoning-parser.
-    @ReasoningParserManager.register_module(["example"])
     class ExampleParser(ReasoningParser):
         def __init__(self, tokenizer: AnyTokenizer):
             super().__init__(tokenizer)
 
-        def extract_reasoning_content_streaming(
+        def extract_reasoning_streaming(
             self,
             previous_text: str,
             current_text: str,
@@ -241,7 +243,7 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso
             previously been parsed and extracted (see constructor)
             """
 
-        def extract_reasoning_content(
+        def extract_reasoning(
             self,
             model_output: str,
             request: ChatCompletionRequest | ResponsesRequest,
@@ -263,6 +265,12 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso
             tuple[Optional[str], Optional[str]]
                 A tuple containing the reasoning content and the content.
             """
+    # Register the reasoning parser
+    ReasoningParserManager.register_lazy_module(
+        name="example",
+        module_path="vllm.reasoning.example_reasoning_parser",
+        class_name="ExampleParser",
+    )
     ```
 
 Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index e7dd9fee12d37..edcbaa7164479 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -13,6 +13,9 @@ Key benefits:
 !!! note
     This feature is only supported on CUDA platform.
 
+!!! note
+    For more information, see this [Blog Post](https://blog.vllm.ai/2025/10/26/sleep-mode.html).
+
 ## Sleep levels
 
 Level 1 sleep will offload the model weights and discard the KV cache. The content of KV cache is forgotten. Level 1 sleep is good for sleeping and waking up the engine to run the same model again. The model weights are backed up in CPU memory. Please make sure there's enough CPU memory to store the model weights. Level 2 sleep will discard both the model weights and the KV cache (while the model's buffers are kept in CPU, like rope scaling tensors). The content of both the model weights and KV cache is forgotten. Level 2 sleep is good for sleeping and waking up the engine to run a different model or update the model, where previous model weights are not needed, e.g. RLHF weight update.
@@ -31,6 +34,7 @@ llm = LLM("Qwen/Qwen3-0.6B", enable_sleep_mode=True)
 #### Python API
 
 ```python
+# Sleep level 1
 # Put the engine to sleep (level=1: offload weights to CPU RAM, discard KV cache)
 llm.sleep(level=1)
 
@@ -38,6 +42,21 @@ llm.sleep(level=1)
 llm.wake_up()
 ```
 
+```python
+# Sleep level 2
+# Put the engine to sleep (level=2: discard both weights and KV cache)
+llm.sleep(level=2)
+
+# Reallocate weights memory only
+llm.wake_up(tags=["weights"])
+
+# Load weights in-place
+llm.collective_rpc("reload_weights")
+
+# Reallocate KV cache
+llm.wake_up(tags=["kv_cache"])
+```
+
 #### RLHF weight updates
 
 During RLHF training, vLLM allows you to selectively wake up only the model weights or the KV cache using the tags argument in wake_up(). This fine-grained control is especially useful when updating model weights: by waking up just the weights (e.g., llm.wake_up(tags=["weights"])), you avoid allocating memory for the KV cache until after the weight update is complete. This approach helps prevent GPU out-of-memory (OOM) errors, particularly with large models, by minimizing peak memory usage during weight synchronization and update operations.
@@ -69,10 +88,30 @@ VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
   --port 8000
 ```
 
+Below is an example of how to sleep and wake up a model in level 1.
+
+```bash
+curl -X POST 'http://localhost:8000/sleep?level=1'
+curl -X POST 'http://localhost:8000/wake_up'
+```
+
+And this is an example of how to sleep and wake up a model in level 2.
+
+```bash
+curl -X POST 'http://localhost:8000/sleep?level=2'
+# Reallocate weights memory only
+curl -X POST 'http://localhost:8000/wake_up?tags=weights'
+# Load weights in-place
+curl -X POST 'http://localhost:8000/collective_rpc' -H 'Content-Type: application/json' -d '{"method":"reload_weights"}'
+# Reallocate KV cache
+curl -X POST 'http://localhost:8000/wake_up?tags=kv_cache'
+```
+
 #### HTTP endpoints
 
 - `POST /sleep?level=1` — Put the model to sleep (`level=1`).
 - `POST /wake_up` — Wake up the model. Supports optional `tags` query parameters for partial wake-up (e.g., `?tags=weights`).
+- `POST /collective_rpc` — Perform a collective remote procedure call (RPC).
 - `GET /is_sleeping` — Check if the model is sleeping.
 
 !!! note
diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md
index ab72c7d97b7a4..6097500cac01f 100644
--- a/docs/features/spec_decode.md
+++ b/docs/features/spec_decode.md
@@ -130,6 +130,46 @@ matching n-grams in the prompt. For more information read [this thread.](https:/
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
     ```
 
+## Speculating using Suffix Decoding
+
+The following code configures vLLM to use speculative decoding where proposals are generated using Suffix Decoding ([technical report](https://arxiv.org/abs/2411.04975)).
+
+Like n-gram, Suffix Decoding can generate draft tokens by pattern-matching using the last `n` generated tokens. Unlike n-gram, Suffix Decoding (1) can pattern-match against both the prompt and previous generations, (2) uses frequency counts to propose the most likely continuations, and (3) speculates an adaptive number of tokens for each request at each iteration to get better acceptance rates.
+
+Suffix Decoding can achieve better performance for tasks with high repetition, such as code-editing, agentic loops (e.g. self-reflection, self-consistency), and RL rollouts.
+
+!!! tip "Install Arctic Inference"
+    Suffix Decoding requires [Arctic Inference](https://github.com/snowflakedb/ArcticInference). You can install it with `pip install arctic-inference`.
+
+!!! tip "Suffix Decoding Speculative Tokens"
+    Suffix Decoding will speculate a dynamic number of tokens for each request at each decoding step, so the `num_speculative_tokens` configuration specifies the *maximum* number of speculative tokens. It is suggested to use a high number such as `16` or `32` (default).
+
+??? code
+
+    ```python
+    from vllm import LLM, SamplingParams
+
+    prompts = [
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    llm = LLM(
+        model="facebook/opt-6.7b",
+        tensor_parallel_size=1,
+        speculative_config={
+            "method": "suffix",
+            "num_speculative_tokens": 32,
+        },
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```
+
 ## Speculating using MLP speculators
 
 The following code configures vLLM to use speculative decoding where proposals are generated by
diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md
index 9e1da37ca962d..e38627c707884 100644
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -204,7 +204,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th
             }
         },
     )
-    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+    print("reasoning: ", completion.choices[0].message.reasoning)
     print("content: ", completion.choices[0].message.content)
     ```
 
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index 7a1b30096a56d..7e6c69e717dba 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -407,7 +407,6 @@ Here is a summary of a plugin file:
     # the name list in register_module can be used
     # in --tool-call-parser. you can define as many
     # tool parsers as you want here.
-    @ToolParserManager.register_module(["example"])
     class ExampleToolParser(ToolParser):
         def __init__(self, tokenizer: AnyTokenizer):
             super().__init__(tokenizer)
@@ -439,6 +438,12 @@ Here is a summary of a plugin file:
             return ExtractedToolCallInformation(tools_called=False,
                                                 tool_calls=[],
                                                 content=text)
+    # register the tool parser to ToolParserManager
+    ToolParserManager.register_lazy_module(
+        name="example",
+        module_path="vllm.entrypoints.openai.tool_parsers.example",
+        class_name="ExampleToolParser",
+    )
 
     ```
 
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index 747035d38e3b0..2369eaed1802e 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -94,7 +94,7 @@ Currently, there are no pre-built CPU wheels.
 ## Related runtime environment variables
 
 - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
-- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively.
+- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists, `auto` (by default), or `nobind` (to disable binding to individual CPU cores and to inherit user-defined OpenMP variables). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively. If set to `nobind`, the number of OpenMP threads is determined by the standard `OMP_NUM_THREADS` environment variable.
 - `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`.
 - `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to ```CUDA_VISIBLE_DEVICES```. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing nodes binding sequence.
 - `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
diff --git a/docs/getting_started/installation/gpu.rocm.inc.md b/docs/getting_started/installation/gpu.rocm.inc.md
index f546e0f0e5052..c80ba9478f6be 100644
--- a/docs/getting_started/installation/gpu.rocm.inc.md
+++ b/docs/getting_started/installation/gpu.rocm.inc.md
@@ -11,9 +11,10 @@ vLLM supports AMD GPUs with ROCm 6.3 or above, and torch 2.8.0 and above.
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
 
-- GPU: MI200s (gfx90a), MI300 (gfx942), MI350 (gfx950), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201)
+- GPU: MI200s (gfx90a), MI300 (gfx942), MI350 (gfx950), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201), Ryzen AI MAX / AI 300 Series (gfx1151/1150)
 - ROCm 6.3 or above
     - MI350 requires ROCm 7.0 or above
+    - Ryzen AI MAX / AI 300 Series requires ROCm 7.0.2 or above
 
 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index ea89108f01fc2..ce1c5c53cf35a 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -3,6 +3,7 @@
 import importlib
 import logging
 import sys
+import traceback
 from argparse import SUPPRESS, HelpFormatter
 from pathlib import Path
 from typing import Literal
@@ -16,7 +17,30 @@ ROOT_DIR = Path(__file__).parent.parent.parent.parent
 ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"
 
 sys.path.insert(0, str(ROOT_DIR))
+
+
+# Mock custom op code
+class MockCustomOp:
+    @staticmethod
+    def register(name):
+        def decorator(cls):
+            return cls
+
+        return decorator
+
+
+noop = lambda *a, **k: None
 sys.modules["vllm._C"] = MagicMock()
+sys.modules["vllm.model_executor.custom_op"] = MagicMock(CustomOp=MockCustomOp)
+sys.modules["vllm.utils.torch_utils"] = MagicMock(direct_register_custom_op=noop)
+
+# Mock any version checks by reading from compiled CI requirements
+with open(ROOT_DIR / "requirements/test.txt") as f:
+    VERSIONS = dict(line.strip().split("==") for line in f if "==" in line)
+importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0"
+
+# Make torch.nn.Parameter safe to inherit from
+sys.modules["torch.nn"] = MagicMock(Parameter=object)
 
 
 class PydanticMagicMock(MagicMock):
@@ -31,20 +55,17 @@ class PydanticMagicMock(MagicMock):
         return core_schema.any_schema()
 
 
-def auto_mock(module, attr, max_mocks=50):
+def auto_mock(module, attr, max_mocks=100):
     """Function that automatically mocks missing modules during imports."""
     logger.info("Importing %s from %s", attr, module)
     for _ in range(max_mocks):
         try:
             # First treat attr as an attr, then as a submodule
-            with patch("importlib.metadata.version", return_value="0.0.0"):
-                return getattr(
-                    importlib.import_module(module),
-                    attr,
-                    importlib.import_module(f"{module}.{attr}"),
-                )
-        except importlib.metadata.PackageNotFoundError as e:
-            raise e
+            return getattr(
+                importlib.import_module(module),
+                attr,
+                importlib.import_module(f"{module}.{attr}"),
+            )
         except ModuleNotFoundError as e:
             logger.info("Mocking %s for argparse doc generation", e.name)
             sys.modules[e.name] = PydanticMagicMock(name=e.name)
@@ -139,10 +160,19 @@ def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
     Returns:
         FlexibleArgumentParser: A parser with markdown formatting for the class.
     """
-    parser = FlexibleArgumentParser(add_json_tip=False)
-    parser.formatter_class = MarkdownFormatter
-    with patch("vllm.config.DeviceConfig.__post_init__"):
-        _parser = add_cli_args(parser, **kwargs)
+    try:
+        parser = FlexibleArgumentParser(add_json_tip=False)
+        parser.formatter_class = MarkdownFormatter
+        with patch("vllm.config.DeviceConfig.__post_init__"):
+            _parser = add_cli_args(parser, **kwargs)
+    except ModuleNotFoundError as e:
+        # Auto-mock runtime imports
+        if tb_list := traceback.extract_tb(e.__traceback__):
+            path = Path(tb_list[-1].filename).relative_to(ROOT_DIR)
+            auto_mock(module=".".join(path.parent.parts), attr=path.stem)
+            return create_parser(add_cli_args, **kwargs)
+        else:
+            raise e
     # add_cli_args might be in-place so return parser if _parser is None
     return _parser or parser
 
@@ -184,3 +214,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
         with open(doc_path, "w", encoding="utf-8") as f:
             f.write(super(type(parser), parser).format_help())
         logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))
+
+
+if __name__ == "__main__":
+    on_startup("build", False)
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index fd25647dce54b..eed1b3fb4bc85 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -404,6 +404,8 @@ th {
 | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ |
 | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ |
 | `OuroForCausalLM` | ouro | `ByteDance/Ouro-1.4B`, `ByteDance/Ouro-2.6B`, etc. | ✅︎ | |
+| `PanguEmbeddedForCausalLM` |openPangu-Embedded-7B | `FreedomIntelligence/openPangu-Embedded-7B-V1.1` | ✅︎ | ✅︎ |
+| `PanguUltraMoEForCausalLM` |openpangu-ultra-moe-718b-model | `FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1` | ✅︎ | ✅︎ |
 | `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ |
 | `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
 | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
@@ -675,6 +677,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
 | `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
 | `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
+| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
 | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
@@ -758,6 +761,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
 | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | |
 | `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |
 | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
+| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
 
 ### Pooling Models
 
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md
index 94e801376e531..1995045fd5562 100644
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -316,6 +316,10 @@ Traceback (most recent call last):
 
 This indicates vLLM failed to initialize the NCCL communicator, possibly due to a missing `IPC_LOCK` linux capability  or an unmounted `/dev/shm`. Refer to [Enabling GPUDirect RDMA](../serving/parallelism_scaling.md#enabling-gpudirect-rdma) for guidance on properly configuring the environment for GPUDirect RDMA.
 
+## CUDA error: the provided PTX was compiled with an unsupported toolchain
+
+If you see an error like `RuntimeError: CUDA error: the provided PTX was compiled with an unsupported toolchain.`, it means that the CUDA PTX in vLLM's wheels was compiled with a toolchain unsupported by your system. The released vLLM wheels have to be compiled with a specific version of CUDA toolkit, and the compiled code might fail to run on lower versions of CUDA drivers. Read [cuda compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/) for more details. The solution is to install `cuda-compat` package from your package manager. For example, on Ubuntu, you can run `sudo apt-get install cuda-compat-12-9`, and then add `export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH` to your `.bashrc` file. When successfully installed, you should see that the output of `nvidia-smi` will show `CUDA Version: 12.9`. Note that we use CUDA 12.9 as an example here, you may want to install a higher version of cuda-compat package in case vLLM's default CUDA version goes higher.
+
 ## Known Issues
 
 - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](https://github.com/vllm-project/vllm/pull/6759).
diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index c47547cb0ea7a..8d8a9e0f50805 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -6,8 +6,6 @@
 
 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
 
-To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
-
 ## Why vLLM V1?
 
 vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py
index 52c2363c89874..c5ae35985c17c 100644
--- a/examples/offline_inference/load_sharded_state.py
+++ b/examples/offline_inference/load_sharded_state.py
@@ -11,7 +11,7 @@ python save_sharded_state.py \
     --model /path/to/load \
     --quantization deepspeedfp \
     --tensor-parallel-size 8 \
-    --output /path/to/save/sharded/modele
+    --output /path/to/save/sharded/model
 
 python load_sharded_state.py \
     --model /path/to/saved/sharded/model \
diff --git a/examples/offline_inference/logits_processor/custom.py b/examples/offline_inference/logits_processor/custom.py
index 72e7ce24d7cc8..ce000872dc96e 100644
--- a/examples/offline_inference/logits_processor/custom.py
+++ b/examples/offline_inference/logits_processor/custom.py
@@ -33,6 +33,8 @@ Output:    ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """
 
+from typing import Any
+
 import torch
 
 from vllm import LLM, SamplingParams
@@ -48,6 +50,16 @@ from vllm.v1.sample.logits_processor.builtin import process_dict_updates
 class DummyLogitsProcessor(LogitsProcessor):
     """Fake logit processor to support unit testing and examples"""
 
+    @classmethod
+    def validate_params(cls, params: SamplingParams):
+        target_token: Any | None = params.extra_args and params.extra_args.get(
+            "target_token"
+        )
+        if target_token is not None and not isinstance(target_token, int):
+            raise ValueError(
+                f"target_token value {target_token} {type(target_token)} is not int"
+            )
+
     def __init__(
         self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool
     ):
@@ -57,14 +69,17 @@ class DummyLogitsProcessor(LogitsProcessor):
         return False
 
     def update_state(self, batch_update: BatchUpdate | None):
+        def extract_extra_arg(params: SamplingParams) -> int | None:
+            self.validate_params(params)
+            return params.extra_args and params.extra_args.get("target_token")
+
         process_dict_updates(
             self.req_info,
             batch_update,
             # This function returns the LP's per-request state based on the
             # request details, or None if this LP does not apply to the
             # request.
-            lambda params, _, __: params.extra_args
-            and (params.extra_args.get("target_token")),
+            lambda params, _, __: extract_extra_arg(params),
         )
 
     def apply(self, logits: torch.Tensor) -> torch.Tensor:
diff --git a/examples/offline_inference/logits_processor/custom_req.py b/examples/offline_inference/logits_processor/custom_req.py
index 87cd7473fa9f1..5763fff5410dd 100644
--- a/examples/offline_inference/logits_processor/custom_req.py
+++ b/examples/offline_inference/logits_processor/custom_req.py
@@ -76,6 +76,14 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
     """Example of wrapping a fake request-level logit processor to create a
     batch-level logits processor"""
 
+    @classmethod
+    def validate_params(cls, params: SamplingParams):
+        target_token: Any | None = params.extra_args and params.extra_args.get(
+            "target_token"
+        )
+        if target_token is not None and not isinstance(target_token, int):
+            raise ValueError(f"target_token value {target_token} is not int")
+
     def is_argmax_invariant(self) -> bool:
         return False
 
@@ -101,13 +109,6 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
         )
         if target_token is None:
             return None
-        if not isinstance(target_token, int):
-            logger.warning(
-                "target_token value %s is not int; not applying logits"
-                " processor to request.",
-                target_token,
-            )
-            return None
         return DummyPerReqLogitsProcessor(target_token)
 
 
diff --git a/examples/offline_inference/logits_processor/custom_req_init.py b/examples/offline_inference/logits_processor/custom_req_init.py
index 3bb82a786040b..acd2c47f230f0 100644
--- a/examples/offline_inference/logits_processor/custom_req_init.py
+++ b/examples/offline_inference/logits_processor/custom_req_init.py
@@ -77,6 +77,14 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
     """Example of overriding the wrapper class `__init__()` in order to utilize
     info about the device type"""
 
+    @classmethod
+    def validate_params(cls, params: SamplingParams):
+        target_token = params.extra_args and params.extra_args.get("target_token")
+        if target_token is not None and not isinstance(target_token, int):
+            raise ValueError(
+                f"`target_token` has to be an integer, got {target_token}."
+            )
+
     def __init__(
         self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool
     ):
@@ -113,13 +121,6 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
             is None
         ):
             return None
-        if not isinstance(target_token, int):
-            logger.warning(
-                "target_token value %s is not int; not applying logits"
-                " processor to request.",
-                target_token,
-            )
-            return None
         return DummyPerReqLogitsProcessor(target_token)
 
 
diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index f5f6e28b5fd9b..3cdc3b245b72a 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -16,18 +16,18 @@ except ImportError:
 
 QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/flycatcher.jpeg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/somefish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/starfish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/snail.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/thistle.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/husky.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/orangetabbycat.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/guineapig.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/rabbit.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/horsepony.jpg",
 ]
 
 
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index c1ea95f8d0644..371cf6309a678 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1242,6 +1242,32 @@ def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# PaddleOCR-VL
+def run_paddleocr_vl(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "PaddlePaddle/PaddleOCR-VL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        limit_mm_per_prompt={modality: 1},
+        trust_remote_code=True,
+    )
+
+    placeholder = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
+    prompts = [
+        (f"<|begin_of_sentence|>User: {question}{placeholder}\nAssistant: ")
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # PaliGemma
 def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1817,6 +1843,7 @@ model_example_map = {
     "NVLM_D": run_nvlm_d,
     "ovis": run_ovis,
     "ovis2_5": run_ovis2_5,
+    "paddleocr_vl": run_paddleocr_vl,
     "paligemma": run_paligemma,
     "paligemma2": run_paligemma2,
     "phi3_v": run_phi3v,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 5cb47c15038e8..d6e169548f15b 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -22,18 +22,18 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/flycatcher.jpeg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/somefish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/starfish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/snail.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/thistle.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/husky.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/orangetabbycat.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/guineapig.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/rabbit.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/horsepony.jpg",
 ]
 
 
@@ -801,6 +801,27 @@ def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_paddleocr_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "PaddlePaddle/PaddleOCR-VL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" * len(image_urls)
+    prompt = f"<|begin_of_sentence|>User: {question}{placeholders}\nAssistant: "
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "mistral-community/pixtral-12b"
 
@@ -1312,6 +1333,7 @@ model_example_map = {
     "NVLM_D": load_nvlm_d,
     "ovis": load_ovis,
     "ovis2_5": load_ovis2_5,
+    "paddleocr_vl": load_paddleocr_vl,
     "phi3_v": load_phi3v,
     "phi4_mm": load_phi4mm,
     "phi4_multimodal": load_phi4_multimodal,
diff --git a/examples/online_serving/chart-helm/README.md b/examples/online_serving/chart-helm/README.md
index bfe81121d1fd4..4376aac488f05 100644
--- a/examples/online_serving/chart-helm/README.md
+++ b/examples/online_serving/chart-helm/README.md
@@ -19,3 +19,15 @@ This directory contains a Helm chart for deploying the vllm application. The cha
 - templates/pvc.yaml: Template for Persistent Volume Claims.
 - templates/secrets.yaml: Template for Kubernetes Secrets.
 - templates/service.yaml: Template for creating Services.
+
+## Running Tests
+
+This chart includes unit tests using [helm-unittest](https://github.com/helm-unittest/helm-unittest). Install the plugin and run tests:
+
+```bash
+# Install plugin
+helm plugin install https://github.com/helm-unittest/helm-unittest
+
+# Run tests
+helm unittest .
+```
diff --git a/examples/online_serving/chart-helm/templates/_helpers.tpl b/examples/online_serving/chart-helm/templates/_helpers.tpl
index a9690bad3c945..3226c1d79c428 100644
--- a/examples/online_serving/chart-helm/templates/_helpers.tpl
+++ b/examples/online_serving/chart-helm/templates/_helpers.tpl
@@ -123,9 +123,6 @@ runAsUser:
 {{-   end }}
 {{- end }}
 
-{{- define "chart.extraInitImage" -}}
-"amazon/aws-cli:2.6.4"
-{{- end }}
 
 {{- define "chart.extraInitEnv" -}}
 - name: S3_ENDPOINT_URL
@@ -148,11 +145,15 @@ runAsUser:
     secretKeyRef:
       name: {{ .Release.Name }}-secrets
       key: s3accesskey
+{{- if .Values.extraInit.s3modelpath }}
 - name: S3_PATH
   value: "{{ .Values.extraInit.s3modelpath }}"
+{{- end }}
+{{- if hasKey .Values.extraInit "awsEc2MetadataDisabled" }}
 - name: AWS_EC2_METADATA_DISABLED
   value: "{{ .Values.extraInit.awsEc2MetadataDisabled }}"
 {{- end }}
+{{- end }}
 
 {{/*
   Define chart labels
diff --git a/examples/online_serving/chart-helm/templates/deployment.yaml b/examples/online_serving/chart-helm/templates/deployment.yaml
index 536983b587be2..a0a3c4b9ee523 100644
--- a/examples/online_serving/chart-helm/templates/deployment.yaml
+++ b/examples/online_serving/chart-helm/templates/deployment.yaml
@@ -72,16 +72,21 @@ spec:
         {{ toYaml . | nindent 8 }}
         {{- end }}
 
-      {{-   if .Values.extraInit  }}
+      {{- if and .Values.extraInit (or .Values.extraInit.modelDownload.enabled .Values.extraInit.initContainers) }}
       initContainers:
+      {{- if .Values.extraInit.modelDownload.enabled }}
       - name: wait-download-model
-        image: {{ include "chart.extraInitImage" . }}
-        command: 
-          - /bin/bash
+        image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
+        imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
+        command: {{ .Values.extraInit.modelDownload.waitContainer.command | toJson }}
         args:
-          - -eucx
-          - while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done
-        env: {{- include "chart.extraInitEnv" . | nindent 10 }}
+        {{- toYaml .Values.extraInit.modelDownload.waitContainer.args | nindent 10 }}
+        env:
+        {{- if .Values.extraInit.modelDownload.waitContainer.env }}
+        {{- toYaml .Values.extraInit.modelDownload.waitContainer.env | nindent 10 }}
+        {{- else }}
+        {{- include "chart.extraInitEnv" . | nindent 10 }}
+        {{- end }}
         resources:
           requests:
             cpu: 200m
@@ -93,6 +98,10 @@ spec:
         - name: {{ .Release.Name }}-storage
           mountPath: /data
       {{- end }}
+      {{- with .Values.extraInit.initContainers }}
+      {{- toYaml . | nindent 6 }}
+      {{- end }}
+      {{- end }}
       volumes:
         - name: {{ .Release.Name }}-storage
           persistentVolumeClaim:
diff --git a/examples/online_serving/chart-helm/templates/job.yaml b/examples/online_serving/chart-helm/templates/job.yaml
index f9ea3541e78d2..98d313916ca48 100644
--- a/examples/online_serving/chart-helm/templates/job.yaml
+++ b/examples/online_serving/chart-helm/templates/job.yaml
@@ -1,4 +1,4 @@
-{{-   if .Values.extraInit  }}
+{{- if and .Values.extraInit .Values.extraInit.modelDownload.enabled }}
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -12,13 +12,17 @@ spec:
    spec:
     containers:
     - name: job-download-model
-      image: {{ include "chart.extraInitImage" . }}
-      command: 
-        - /bin/bash
+      image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
+      imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
+      command: {{ .Values.extraInit.modelDownload.downloadJob.command | toJson }}
       args:
-        - -eucx
-        - aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data
-      env: {{- include "chart.extraInitEnv" . | nindent 8 }}
+      {{- toYaml .Values.extraInit.modelDownload.downloadJob.args | nindent 8 }}
+      env:
+      {{- if .Values.extraInit.modelDownload.downloadJob.env }}
+      {{- toYaml .Values.extraInit.modelDownload.downloadJob.env | nindent 8 }}
+      {{- else }}
+      {{- include "chart.extraInitEnv" . | nindent 8 }}
+      {{- end }}
       volumeMounts:
         - name: {{ .Release.Name }}-storage
           mountPath: /data
diff --git a/examples/online_serving/chart-helm/tests/deployment_test.yaml b/examples/online_serving/chart-helm/tests/deployment_test.yaml
new file mode 100644
index 0000000000000..9b7472cf0fd43
--- /dev/null
+++ b/examples/online_serving/chart-helm/tests/deployment_test.yaml
@@ -0,0 +1,135 @@
+suite: test deployment
+templates:
+  - deployment.yaml
+tests:
+  - it: should create wait-download-model init container when modelDownload is enabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+        initContainers: [ ]
+        pvcStorage: "1Gi"
+        s3modelpath: "relative_s3_model_path/opt-125m"
+        awsEc2MetadataDisabled: true
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Deployment
+      - isNotEmpty:
+          path: spec.template.spec.initContainers
+      - equal:
+          path: spec.template.spec.initContainers[0].name
+          value: wait-download-model
+      - equal:
+          path: spec.template.spec.initContainers[0].image
+          value: amazon/aws-cli:2.6.4
+      - equal:
+          path: spec.template.spec.initContainers[0].imagePullPolicy
+          value: IfNotPresent
+
+  - it: should only create custom init containers when modelDownload is disabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: false
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "echo test" ]
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "echo test" ]
+        initContainers:
+          - name: llm-d-routing-proxy
+            image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+            imagePullPolicy: IfNotPresent
+            ports:
+              - containerPort: 8080
+                name: proxy
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Deployment
+      - lengthEqual:
+          path: spec.template.spec.initContainers
+          count: 1
+      - equal:
+          path: spec.template.spec.initContainers[0].name
+          value: llm-d-routing-proxy
+      - equal:
+          path: spec.template.spec.initContainers[0].image
+          value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+      - equal:
+          path: spec.template.spec.initContainers[0].ports[0].containerPort
+          value: 8080
+
+  - it: should create both wait-download-model and custom init containers when both are enabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+        initContainers:
+          - name: llm-d-routing-proxy
+            image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+            imagePullPolicy: IfNotPresent
+            ports:
+              - containerPort: 8080
+                name: proxy
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Deployment
+      - lengthEqual:
+          path: spec.template.spec.initContainers
+          count: 2
+      - equal:
+          path: spec.template.spec.initContainers[0].name
+          value: wait-download-model
+      - equal:
+          path: spec.template.spec.initContainers[0].image
+          value: amazon/aws-cli:2.6.4
+      - equal:
+          path: spec.template.spec.initContainers[1].name
+          value: llm-d-routing-proxy
+      - equal:
+          path: spec.template.spec.initContainers[1].image
+          value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+      - equal:
+          path: spec.template.spec.initContainers[1].ports[0].containerPort
+          value: 8080
\ No newline at end of file
diff --git a/examples/online_serving/chart-helm/tests/job_test.yaml b/examples/online_serving/chart-helm/tests/job_test.yaml
new file mode 100644
index 0000000000000..25d40ff265132
--- /dev/null
+++ b/examples/online_serving/chart-helm/tests/job_test.yaml
@@ -0,0 +1,61 @@
+suite: test job
+templates:
+  - job.yaml
+tests:
+  - it: should create job when modelDownload is enabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "wait" ]
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+        pvcStorage: "1Gi"
+        s3modelpath: "relative_s3_model_path/opt-125m"
+        awsEc2MetadataDisabled: true
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Job
+      - equal:
+          path: spec.template.spec.containers[0].name
+          value: job-download-model
+      - equal:
+          path: spec.template.spec.containers[0].image
+          value: amazon/aws-cli:2.6.4
+      - equal:
+          path: spec.template.spec.restartPolicy
+          value: OnFailure
+
+  - it: should not create job when modelDownload is disabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: false
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "wait" ]
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "download" ]
+        initContainers:
+          - name: llm-d-routing-proxy
+            image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 0
diff --git a/examples/online_serving/chart-helm/tests/pvc_test.yaml b/examples/online_serving/chart-helm/tests/pvc_test.yaml
new file mode 100644
index 0000000000000..2a8b37da7e8bd
--- /dev/null
+++ b/examples/online_serving/chart-helm/tests/pvc_test.yaml
@@ -0,0 +1,32 @@
+suite: test pvc
+templates:
+  - pvc.yaml
+tests:
+  # Test Case: PVC Created When extraInit Defined
+  - it: should create pvc when extraInit is defined
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: ["/bin/bash"]
+            args: ["-c", "wait"]
+          downloadJob:
+            command: ["/bin/bash"]
+            args: ["-c", "download"]
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: PersistentVolumeClaim
+      - equal:
+          path: spec.accessModes[0]
+          value: ReadWriteOnce
+      - equal:
+          path: spec.resources.requests.storage
+          value: 10Gi
\ No newline at end of file
diff --git a/examples/online_serving/chart-helm/values.schema.json b/examples/online_serving/chart-helm/values.schema.json
index 812d54bde1397..0d0e0098bc194 100644
--- a/examples/online_serving/chart-helm/values.schema.json
+++ b/examples/online_serving/chart-helm/values.schema.json
@@ -136,6 +136,70 @@
         "extraInit": {
             "type": "object",
             "properties": {
+                "modelDownload": {
+                    "type": "object",
+                    "properties": {
+                        "enabled": {
+                            "type": "boolean"
+                        },
+                        "image": {
+                            "type": "object",
+                            "properties": {
+                                "repository": {
+                                    "type": "string"
+                                },
+                                "tag": {
+                                    "type": "string"
+                                },
+                                "pullPolicy": {
+                                    "type": "string"
+                                }
+                            },
+                            "required": ["repository", "tag", "pullPolicy"]
+                        },
+                        "waitContainer": {
+                            "type": "object",
+                            "properties": {
+                                "command": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "args": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "env": {
+                                    "type": "array",
+                                    "items": {"type": "object"}
+                                }
+                            },
+                            "required": ["command", "args"]
+                        },
+                        "downloadJob": {
+                            "type": "object",
+                            "properties": {
+                                "command": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "args": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "env": {
+                                    "type": "array",
+                                    "items": {"type": "object"}
+                                }
+                            },
+                            "required": ["command", "args"]
+                        }
+                    },
+                    "required": ["enabled", "image", "waitContainer", "downloadJob"]
+                },
+                "initContainers": {
+                    "type": "array",
+                    "items": {"type": "object"}
+                },
                 "s3modelpath": {
                     "type": "string"
                 },
@@ -147,9 +211,9 @@
                 }
             },
             "required": [
-                "pvcStorage",
-                "s3modelpath",
-                "awsEc2MetadataDisabled"
+                "modelDownload",
+                "initContainers",
+                "pvcStorage"
             ]
         },
         "extraContainers": {
diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml
index 815f02a4bfd52..8c6c9ae8ea239 100644
--- a/examples/online_serving/chart-helm/values.yaml
+++ b/examples/online_serving/chart-helm/values.yaml
@@ -75,10 +75,65 @@ maxUnavailablePodDisruptionBudget: ""
 
 # -- Additional configuration for the init container
 extraInit:
-   # -- Path of the model on the s3 which hosts model weights and config files
+  # -- Model download functionality (optional)
+  modelDownload:
+    # -- Enable model download job and wait container
+    enabled: true
+    # -- Image configuration for model download operations
+    image:
+      # -- Image repository
+      repository: "amazon/aws-cli"
+      # -- Image tag
+      tag: "2.6.4"
+      # -- Image pull policy
+      pullPolicy: "IfNotPresent"
+    # -- Wait container configuration (init container that waits for model to be ready)
+    waitContainer:
+      # -- Command to execute
+      command: ["/bin/bash"]
+      # -- Arguments for the wait container
+      args:
+        - "-eucx"
+        - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
+      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
+      # env:
+      #   - name: HUGGING_FACE_HUB_TOKEN
+      #     value: "your-token"
+      #   - name: MODEL_ID
+      #     value: "meta-llama/Llama-2-7b"
+    # -- Download job configuration (job that actually downloads the model)
+    downloadJob:
+      # -- Command to execute
+      command: ["/bin/bash"]
+      # -- Arguments for the download job
+      args:
+        - "-eucx"
+        - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
+      # env:
+      #   - name: HUGGING_FACE_HUB_TOKEN
+      #     value: "your-token"
+      #   - name: MODEL_ID
+      #     value: "meta-llama/Llama-2-7b"
+
+  # -- Custom init containers (appended after wait-download-model if modelDownload is enabled)
+  initContainers: []
+  # Example for llm-d sidecar:
+  # initContainers:
+  #   - name: llm-d-routing-proxy
+  #     image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+  #     imagePullPolicy: IfNotPresent
+  #     ports:
+  #       - containerPort: 8080
+  #         name: proxy
+  #     securityContext:
+  #       runAsUser: 1000
+
+  # -- Path of the model on the s3 which hosts model weights and config files
   s3modelpath: "relative_s3_model_path/opt-125m"
-   # -- Storage size of the s3
+  # -- Storage size for the PVC
   pvcStorage: "1Gi"
+  # -- Disable AWS EC2 metadata service
   awsEc2MetadataDisabled: true
 
 # -- Additional containers configuration
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index 9fa600ff458db..520cbca003aa5 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -112,8 +112,8 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
 
 # Multi-image input inference
 def run_multi_image(model: str, max_completion_tokens: int) -> None:
-    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
-    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
+    image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
+    image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
     chat_completion_from_url = client.chat.completions.create(
         messages=[
             {
diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
index 4006d07f73b00..1dfc3084646dd 100644
--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example demonstrates how to use tool calling with reasoning models 
-like QwQ-32B. The reasoning_content will not be parsed by the tool 
+like QwQ-32B. The reasoning will not be parsed by the tool 
 calling process; only the final output will be parsed.
 
 To run this example, you need to start the vLLM server with both 
@@ -78,7 +78,7 @@ messages = [
 
 
 def extract_reasoning_and_calls(chunks: list):
-    reasoning_content = ""
+    reasoning = ""
     tool_call_idx = -1
     arguments = []
     function_names = []
@@ -97,9 +97,9 @@ def extract_reasoning_and_calls(chunks: list):
                 if tool_call.function.arguments:
                     arguments[tool_call_idx] += tool_call.function.arguments
         else:
-            if hasattr(chunk.choices[0].delta, "reasoning_content"):
-                reasoning_content += chunk.choices[0].delta.reasoning_content
-    return reasoning_content, arguments, function_names
+            if hasattr(chunk.choices[0].delta, "reasoning"):
+                reasoning += chunk.choices[0].delta.reasoning
+    return reasoning, arguments, function_names
 
 
 def main():
@@ -115,7 +115,7 @@ def main():
     tool_calls = client.chat.completions.create(
         messages=messages, model=model, tools=tools
     )
-    print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+    print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
     print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}")
     print(
         f"function arguments: "
@@ -129,9 +129,9 @@ def main():
 
     chunks = list(tool_calls_stream)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
 
-    print(f"reasoning_content: {reasoning_content}")
+    print(f"reasoning: {reasoning}")
     print(f"function name: {function_names[0]}")
     print(f"function arguments: {arguments[0]}")
 
@@ -144,7 +144,7 @@ def main():
     )
 
     tool_call = tool_calls.choices[0].message.tool_calls[0].function
-    print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+    print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
     print(f"function name: {tool_call.name}")
     print(f"function arguments: {tool_call.arguments}")
     print("----------Stream Generate With Named Function Calling--------------")
@@ -159,8 +159,8 @@ def main():
 
     chunks = list(tool_calls_stream)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
-    print(f"reasoning_content: {reasoning_content}")
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
+    print(f"reasoning: {reasoning}")
     print(f"function name: {function_names[0]}")
     print(f"function arguments: {arguments[0]}")
     print("\n\n")
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index 932dbeb2e7a24..87043897b058c 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -38,10 +38,10 @@ def main():
     # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
     response = client.chat.completions.create(model=model, messages=messages)
 
-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content
 
-    print("reasoning_content for Round 1:", reasoning_content)
+    print("reasoning for Round 1:", reasoning)
     print("content for Round 1:", content)
 
     # Round 2
@@ -54,10 +54,10 @@ def main():
     )
     response = client.chat.completions.create(model=model, messages=messages)
 
-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content
 
-    print("reasoning_content for Round 2:", reasoning_content)
+    print("reasoning for Round 2:", reasoning)
     print("content for Round 2:", content)
 
 
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
index 7d1ea37714599..8e262701b7201 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -20,7 +20,7 @@ in real-time as they are generated by the model. This is useful for scenarios
 where you want to display chat completions to the user as they are generated
 by the model.
 
-Remember to check content and reasoning_content exist in `ChatCompletionChunk`,
+Remember to check content and reasoning exist in `ChatCompletionChunk`,
 content may not exist leading to errors if you try to access it.
 """
 
@@ -47,22 +47,20 @@ def main():
     stream = client.chat.completions.create(model=model, messages=messages, stream=True)
 
     print("client: Start streaming chat completions...")
-    printed_reasoning_content = False
+    printed_reasoning = False
     printed_content = False
 
     for chunk in stream:
-        # Safely extract reasoning_content and content from delta,
+        # Safely extract reasoning and content from delta,
         # defaulting to None if attributes don't exist or are empty strings
-        reasoning_content = (
-            getattr(chunk.choices[0].delta, "reasoning_content", None) or None
-        )
+        reasoning = getattr(chunk.choices[0].delta, "reasoning", None) or None
         content = getattr(chunk.choices[0].delta, "content", None) or None
 
-        if reasoning_content is not None:
-            if not printed_reasoning_content:
-                printed_reasoning_content = True
-                print("reasoning_content:", end="", flush=True)
-            print(reasoning_content, end="", flush=True)
+        if reasoning is not None:
+            if not printed_reasoning:
+                printed_reasoning = True
+                print("reasoning:", end="", flush=True)
+            print(reasoning, end="", flush=True)
         elif content is not None:
             if not printed_content:
                 printed_content = True
diff --git a/examples/online_serving/openai_responses_client_with_tools.py b/examples/online_serving/openai_responses_client_with_tools.py
new file mode 100644
index 0000000000000..276010197b5ab
--- /dev/null
+++ b/examples/online_serving/openai_responses_client_with_tools.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Set up this example by starting a vLLM OpenAI-compatible server with tool call
+options enabled.
+Reasoning models can be used through the Responses API as seen here 
+https://platform.openai.com/docs/api-reference/responses
+For example:
+vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
+      --structured-outputs-config.backend xgrammar \
+      --enable-auto-tool-choice --tool-call-parser hermes
+"""
+
+import json
+
+from openai import OpenAI
+from utils import get_first_model
+
+
+def get_weather(latitude: float, longitude: float) -> str:
+    """
+    Mock function to simulate getting weather data.
+    In a real application, this would call an external weather API.
+    """
+    return f"Current temperature at ({latitude}, {longitude}) is 20°C."
+
+
+tools = [
+    {
+        "type": "function",
+        "name": "get_weather",
+        "description": "Get current temperature for provided coordinates in celsius.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "latitude": {"type": "number"},
+                "longitude": {"type": "number"},
+            },
+            "required": ["latitude", "longitude"],
+            "additionalProperties": False,
+        },
+        "strict": True,
+    }
+]
+
+input_messages = [
+    {"role": "user", "content": "What's the weather like in Paris today?"}
+]
+
+
+def main():
+    base_url = "http://0.0.0.0:8000/v1"
+    client = OpenAI(base_url=base_url, api_key="empty")
+    model = get_first_model(client)
+    response = client.responses.create(
+        model=model, input=input_messages, tools=tools, tool_choice="required"
+    )
+
+    for out in response.output:
+        if out.type == "function_call":
+            print("Function call:", out.name, out.arguments)
+            tool_call = out
+    args = json.loads(tool_call.arguments)
+    result = get_weather(args["latitude"], args["longitude"])
+
+    input_messages.append(tool_call)  # append model's function call message
+    input_messages.append(
+        {  # append result message
+            "type": "function_call_output",
+            "call_id": tool_call.call_id,
+            "output": str(result),
+        }
+    )
+    response_2 = client.responses.create(
+        model=model,
+        input=input_messages,
+        tools=tools,
+    )
+    print(response_2.output_text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/run_cluster.sh b/examples/online_serving/run_cluster.sh
index 522b9566212bb..0756d4b0ae556 100644
--- a/examples/online_serving/run_cluster.sh
+++ b/examples/online_serving/run_cluster.sh
@@ -83,6 +83,29 @@ else
     RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
 fi
 
+# Parse VLLM_HOST_IP from additional args if present.
+# This is needed for multi-NIC configurations where Ray needs explicit IP bindings.
+VLLM_HOST_IP=""
+for arg in "${ADDITIONAL_ARGS[@]}"; do
+    if [[ $arg == "-e" ]]; then
+        continue
+    fi
+    if [[ $arg == VLLM_HOST_IP=* ]]; then
+        VLLM_HOST_IP="${arg#VLLM_HOST_IP=}"
+        break
+    fi
+done
+
+# Build Ray IP environment variables if VLLM_HOST_IP is set.
+# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
+RAY_IP_VARS=()
+if [ -n "${VLLM_HOST_IP}" ]; then
+    RAY_IP_VARS=(
+        -e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
+        -e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
+    )
+fi
+
 # Launch the container with the assembled parameters.
 # --network host: Allows Ray nodes to communicate directly via host networking
 # --shm-size 10.24g: Increases shared memory
@@ -95,5 +118,6 @@ docker run \
     --shm-size 10.24g \
     --gpus all \
     -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
+    "${RAY_IP_VARS[@]}" \
     "${ADDITIONAL_ARGS[@]}" \
     "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py
index 64c8a91782806..d60dbf4d753a7 100644
--- a/examples/online_serving/streamlit_openai_chatbot_webserver.py
+++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py
@@ -159,8 +159,8 @@ def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None
         for chunk in response:
             delta = chunk.choices[0].delta
             # Stream reasoning first
-            if reason and hasattr(delta, "reasoning_content") and live_think:
-                rc = delta.reasoning_content
+            if reason and hasattr(delta, "reasoning") and live_think:
+                rc = delta.reasoning
                 if rc:
                     think_text += rc
                     live_think.markdown(think_text + "▌")
@@ -262,8 +262,8 @@ def server_supports_reasoning():
         messages=[{"role": "user", "content": "Hi"}],
         stream=False,
     )
-    return hasattr(resp.choices[0].message, "reasoning_content") and bool(
-        resp.choices[0].message.reasoning_content
+    return hasattr(resp.choices[0].message, "reasoning") and bool(
+        resp.choices[0].message.reasoning
     )
 
 
diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py
index 02853a95469a6..ff473d044e323 100644
--- a/examples/online_serving/structured_outputs/structured_outputs.py
+++ b/examples/online_serving/structured_outputs/structured_outputs.py
@@ -33,7 +33,7 @@ async def print_stream_response(
     async for chunk in stream_response:
         delta = chunk.choices[0].delta
 
-        reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None)
+        reasoning_chunk_text: str | None = getattr(delta, "reasoning", None)
         content_chunk_text = delta.content
 
         if args.reasoning:
@@ -255,8 +255,8 @@ async def cli():
         for constraint, response in zip(constraints, results):
             print(f"\n\n{constraint}:")
             message = response.choices[0].message
-            if args.reasoning and hasattr(message, "reasoning_content"):
-                print(f"  Reasoning: {message.reasoning_content or ''}")
+            if args.reasoning and hasattr(message, "reasoning"):
+                print(f"  Reasoning: {message.reasoning or ''}")
             print(f"  Content: {message.content!r}")
 
 
diff --git a/mkdocs.yaml b/mkdocs.yaml
index 6f2be65a18af8..bf97093dafb11 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -142,8 +142,3 @@ extra_javascript:
   - https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js
   - mkdocs/javascript/edit_and_feedback.js
   - mkdocs/javascript/slack_and_forum.js
-
-# Makes the url format end in .html rather than act as a dir
-# So index.md generates as index.html and is available under URL /index.html
-# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls
-use_directory_urls: false
diff --git a/pyproject.toml b/pyproject.toml
index 29ee7f75f070a..a250ab6567f12 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ requires = [
     "cmake>=3.26.1",
     "ninja",
     "packaging>=24.2",
-    "setuptools>=77.0.3,<80.0.0",
+    "setuptools>=77.0.3,<81.0.0",
     "setuptools-scm>=8.0",
     "torch == 2.9.0",
     "wheel",
diff --git a/requirements/build.txt b/requirements/build.txt
index ba09eaab70e8e..23ff8d4fdc1c0 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -2,7 +2,7 @@
 cmake>=3.26.1
 ninja
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
 torch==2.9.0
 wheel
diff --git a/requirements/common.txt b/requirements/common.txt
index 724360f8bc9e4..90efb79a845d3 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -19,12 +19,12 @@ pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer == 0.11.3
-llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
+llguidance >= 1.3.0, < 1.4.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" or platform_machine == "s390x"
 outlines_core == 0.2.11
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
+xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
@@ -35,7 +35,7 @@ mistral_common[image,audio] >= 1.8.5
 opencv-python-headless >= 4.11.0    # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
-setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
+setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
 compressed-tensors == 0.12.2 # required for compressed-tensors
 depyf==0.20.0 # required for profiling and debugging with compilation config
@@ -49,3 +49,4 @@ cbor2 # Required for cross-language serialization of hashable objects
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
 anthropic == 0.71.0
+model-hosting-container-standards < 1.0.0
\ No newline at end of file
diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt
index bba7bc7a4d8c4..331d02be6621e 100644
--- a/requirements/cpu-build.txt
+++ b/requirements/cpu-build.txt
@@ -1,7 +1,7 @@
 cmake>=3.26.1
 ninja
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.8.0+cpu; platform_machine == "x86_64"
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index d53ab3649308a..605ce73bff9ce 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -5,9 +5,9 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d
 
 # Dependencies for CPUs
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.8.0+cpu; platform_machine == "x86_64"
+torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
 torch==2.8.0; platform_system == "Darwin"
 torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
 
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 5f7d520cd3662..4e393d6b66152 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -12,4 +12,4 @@ torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytor
 # Build from https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
 xformers==0.0.33+5d4b92a5.d20251029; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.9
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.4.1
+flashinfer-python==0.5.2
diff --git a/requirements/docs.txt b/requirements/docs.txt
index 00c314874016f..0fd6dbe22c512 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -9,12 +9,4 @@ mkdocs-git-revision-date-localized-plugin
 mkdocs-minify-plugin
 regex
 ruff
-
-# Required for argparse hook only
--f https://download.pytorch.org/whl/cpu
-cachetools
-cloudpickle
-py-cpuinfo
-msgspec
 pydantic
-torch
diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt
index 51f58e57a7851..56ec90c563c04 100644
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -9,7 +9,7 @@ torchaudio==2.9.0
 triton==3.5.0
 cmake>=3.26.1,<4
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
 wheel
 jinja2>=3.1.6
diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index 541fa1e267cb0..432e11977872d 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -4,7 +4,7 @@ tblib==3.1.0
 bm25s==0.2.13
 pystemmer==3.0.0
 
-# entrypoints test
+# Entrypoints test
 # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
 audioread==3.0.1
 cffi==1.17.1
@@ -17,11 +17,11 @@ soundfile==0.13.1
 soxr==0.5.0.post1
 librosa==0.10.2.post1
 
-# entrypoints test
+# Entrypoints test
 #vllm[video] # required by entrypoints/openai/test_video.py
 decord==0.6.0
 
-# entrypoints test
+# Entrypoints test
 #sentence-transformers # required by entrypoints/openai/test_score.py
 sentence-transformers==3.4.1
 
@@ -32,7 +32,10 @@ matplotlib==3.10.3
 blobfile==3.0.0
 
 # Required for openai schema test.
-schemathesis==3.39.15 
+schemathesis==3.39.15
 
-# required for mteb test
-mteb[bm25s]>=1.38.11, <2 
+# Required for mteb test
+mteb[bm25s]>=1.38.11, <2
+
+# Required for eval tests
+lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index 6f1cca90e5e2b..f06e4248a7242 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -10,7 +10,7 @@ peft
 pytest-asyncio
 tensorizer==2.10.1
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
 runai-model-streamer[s3,gcs]==0.15.0
 conch-triton-kernels==1.2.1
diff --git a/requirements/test.in b/requirements/test.in
index f57ec31277ce9..30d97e9b9c7d0 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -48,6 +48,7 @@ buildkite-test-collector==0.1.9
 genai_perf==0.0.8
 tritonclient==2.51.0
 
+arctic-inference == 0.1.1 # Required for suffix decoding test
 numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
 runai-model-streamer[s3,gcs]==0.15.0
diff --git a/requirements/test.txt b/requirements/test.txt
index a975f247065da..3263b74c08797 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1,5 +1,5 @@
 # This file was autogenerated by uv via the following command:
-#    uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28
+#    uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
 absl-py==2.1.0
     # via rouge-score
 accelerate==1.0.1
@@ -40,6 +40,8 @@ anyio==4.6.2.post1
     # via
     #   httpx
     #   starlette
+arctic-inference==0.1.1
+    # via -r requirements/test.in
 argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
diff --git a/requirements/xpu.txt b/requirements/xpu.txt
index e69a98b86036e..59ea710684a2c 100644
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -5,7 +5,7 @@ ray>=2.9
 cmake>=3.26.1
 packaging>=24.2
 setuptools-scm>=8
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index 4145e84c2ee0c..7455147f2b95a 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -214,28 +214,72 @@ def test_splitting_ops_dynamic():
         assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
 
 
-def test_resolve_operator_overload():
+def test_should_split():
     import torch
 
-    from vllm.compilation.partition_rules import resolve_defined_ops
+    from vllm.compilation.partition_rules import should_split
 
-    # Test valid operator names
-    resolved = resolve_defined_ops(["aten::mm.default", "aten::addmm.default"])
-    assert len(resolved) == 2
-    assert resolved[0] is torch.ops.aten.mm.default
-    assert resolved[1] is torch.ops.aten.addmm.default
-
-    # Test that invalid operators are skipped (not raising exceptions)
-    resolved = resolve_defined_ops(
-        [
-            "aten::mm.default",
-            "aten::nonexistent_op.default",  # This should be skipped
-            "aten::addmm.default",
-        ]
+    graph = torch.fx.Graph()
+    node = torch.fx.Node(
+        graph=graph,
+        name="dummy_node",
+        op="call_function",
+        target=torch.ops.aten.add.default,
+        args=(),
+        kwargs={},
     )
-    assert len(resolved) == 2  # Only 2 valid ops
-    assert resolved[0] is torch.ops.aten.mm.default
-    assert resolved[1] is torch.ops.aten.addmm.default
+
+    # supports OpOverloadPacket
+    splitting_ops = ["aten::add"]
+    assert should_split(node, splitting_ops)
+
+    # supports OpOverload
+    splitting_ops = ["aten::add.default"]
+    assert should_split(node, splitting_ops)
+
+    # supports OpOverload
+    splitting_ops = ["aten::add.Tensor"]
+    assert not should_split(node, splitting_ops)
+
+    @torch.library.custom_op(
+        "silly::attention",
+        mutates_args=["out"],
+    )
+    def attention(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor
+    ) -> None:
+        out.copy_(q + k + v)
+
+    q, k, v, out = [torch.randn(1)] * 4
+
+    # supports custom ops as OpOverloadPacket
+    node = torch.fx.Node(
+        graph=graph,
+        name="dummy_node",
+        op="call_function",
+        target=torch.ops.silly.attention,
+        args=(q, k, v, out),
+        kwargs={},
+    )
+
+    splitting_ops = ["silly::attention"]
+    assert should_split(node, splitting_ops)
+
+    # supports custom ops as OpOverload
+    node = torch.fx.Node(
+        graph=graph,
+        name="dummy_node",
+        op="call_function",
+        target=torch.ops.silly.attention.default,
+        args=(q, k, v, out),
+        kwargs={},
+    )
+
+    splitting_ops = ["silly::attention"]
+    assert should_split(node, splitting_ops)
+
+    splitting_ops = ["silly::attention.default"]
+    assert should_split(node, splitting_ops)
 
 
 @pytest.mark.skipif(
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 0ad8c17d86686..71f90f6d8d3ee 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -183,8 +183,14 @@ def test_custom_compile_config(
     "compilation_mode",
     [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
 )
-def test_fp8_kv_scale_compile(compilation_mode: int):
-    model = "Qwen/Qwen2-0.5B"
+@pytest.mark.parametrize(
+    "model",
+    [
+        "Qwen/Qwen2-0.5B",  # Standard attention model
+        "deepseek-ai/DeepSeek-V2-Lite",  # MLA (Multi-head Latent Attention) model
+    ],
+)
+def test_fp8_kv_scale_compile(compilation_mode: int, model: str):
     model_kwargs = {
         "quantization": "fp8",
         "kv_cache_dtype": "fp8_e4m3",
diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py
index d66c60ccb5b24..4b910bc285797 100644
--- a/tests/compile/test_fusions_e2e.py
+++ b/tests/compile/test_fusions_e2e.py
@@ -54,11 +54,11 @@ if current_platform.is_cuda():
 
     MODELS_FP4 = [
         ModelBackendTestCase(
-            model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+            model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
             backend=_Backend.FLASHINFER,
-            attention_fusions=48,
-            allreduce_fusions=96,
+            attention_fusions=32,
+            allreduce_fusions=65,
         ),
     ]
 
@@ -71,6 +71,13 @@ if current_platform.is_cuda():
             attention_fusions=0,
             allreduce_fusions=65,
         ),
+        ModelBackendTestCase(
+            model_name="Qwen/Qwen3-30B-A3B",
+            model_kwargs=dict(max_model_len=1024),
+            backend=_Backend.TRITON_ATTN,
+            attention_fusions=0,
+            allreduce_fusions=97,
+        ),
     ]
 
 elif current_platform.is_rocm():
@@ -95,8 +102,7 @@ elif current_platform.is_rocm():
         ),
     ]
 
-# TODO(luka) test both in nightly
-CUSTOM_OPS_FP8 = ["-quant_fp8"]  # , "+quant_fp8"]
+CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
 
 
 @pytest.mark.parametrize(
@@ -171,8 +177,7 @@ def test_attn_quant(
     assert int(matches[0]) == attention_fusions
 
 
-# TODO(luka) test both in nightly
-CUSTOM_OPS_RMS_NORM = ["-rms_norm"]  # , "+rms_norm"]
+CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
 
 
 def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
diff --git a/tests/compile/test_multimodal_compile.py b/tests/compile/test_multimodal_compile.py
index 6c195dd93f423..b76c29819a2df 100644
--- a/tests/compile/test_multimodal_compile.py
+++ b/tests/compile/test_multimodal_compile.py
@@ -3,11 +3,20 @@
 import pytest
 
 from vllm.compilation.counter import compilation_counter
+from vllm.config import VllmConfig
 from vllm.config.compilation import CompilationMode
+from vllm.platforms import current_platform
+
+
+def test_compile():
+    vllm_config = VllmConfig()
+    # Default configuration compiles mm encoder
+    assert vllm_config.compilation_config.compile_mm_encoder
 
 
 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
 @pytest.mark.forked
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
 def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
     """Test that Qwen2.5-VL vision submodules are compiled.
 
@@ -29,8 +38,33 @@ def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
         vllm_runner(
             "Qwen/Qwen2.5-VL-3B-Instruct",
             max_model_len=2048,
-            gpu_memory_utilization=0.7,
+            gpu_memory_utilization=0.8,
             compilation_config={"mode": CompilationMode.VLLM_COMPILE},
         ) as _,
     ):
         pass
+
+
+# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
+@pytest.mark.forked
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
+def test_qwen2_5_vl_no_vit_compilation(vllm_runner, monkeypatch):
+    """Test that Qwen2.5-VL vision submodules are not compiled when the
+    config is passed off
+    """
+    # Disable multiprocessing so that the counter is in the same process
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    with (
+        compilation_counter.expect(num_models_seen=1),
+        vllm_runner(
+            "Qwen/Qwen2.5-VL-3B-Instruct",
+            max_model_len=2048,
+            gpu_memory_utilization=0.8,
+            compilation_config={
+                "mode": CompilationMode.VLLM_COMPILE,
+                "compile_mm_encoder": False,
+            },
+        ) as _,
+    ):
+        pass
diff --git a/tests/conftest.py b/tests/conftest.py
index 41fda04a6c92d..5e127e4e939e6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -154,26 +154,6 @@ AUDIO_ASSETS = AudioTestAssets()
 """Singleton instance of {class}`AudioTestAssets`."""
 
 
-@pytest.fixture(scope="function", autouse=True)
-def cleanup_VLLM_USE_V1(monkeypatch):
-    """
-    The V1 oracle sets "VLLM_USE_V1" during loading. This means
-    that each invocation of a test change the env variable.
-
-    If we touch "VLLM_USE_V1" with monkeypatch, then any changes
-    made during the test run by vLLM will be cleaned up.
-
-    This fixture is used by every test.
-    """
-
-    # If VLLM_USE_V1 is not set, set then delete. This will
-    # cause monkeypatch to clean up VLLM_USE_V1 upon exit
-    # if VLLM modifies the value of envs.VLLM_USE_V1.
-    if "VLLM_USE_V1" not in os.environ:
-        monkeypatch.setenv("VLLM_USE_V1", "")
-        monkeypatch.delenv("VLLM_USE_V1")
-
-
 @pytest.fixture(autouse=True)
 def init_test_http_connection():
     # pytest_asyncio may use a different event loop per test
diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
index 5495640af07eb..3576efca591cf 100644
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -14,6 +14,7 @@ from dataclasses import dataclass
 from typing import Literal, NamedTuple
 
 import pytest
+import torch
 
 from vllm.config.model import RunnerOption
 from vllm.logger import init_logger
@@ -30,6 +31,7 @@ class ParallelSetup(NamedTuple):
     tp_size: int
     pp_size: int
     dcp_size: int
+    dcp_kv_cache_interleave_size: int
     eager_mode: bool
     chunked_prefill: bool
 
@@ -52,6 +54,7 @@ class CPTestSettings:
         tp_base: int = 4,
         pp_base: int = 1,
         dcp_base: int = 1,
+        dcp_kv_cache_interleave_size: int = 1,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
         load_format: str | None = None,
@@ -66,6 +69,7 @@ class CPTestSettings:
                                 tp_size=tp_base,
                                 pp_size=pp_multiplier * pp_base,
                                 dcp_size=int(dcp_multiplier * tp_base),
+                                dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
                                 eager_mode=eager_mode_val,
                                 chunked_prefill=chunked_prefill_val,
                             )
@@ -108,6 +112,7 @@ def _compare_cp_with_tp(
         tp_size,
         pp_size,
         dcp_size,
+        dcp_kv_cache_interleave_size,
         eager_mode,
         chunked_prefill,
     ) = parallel_setup
@@ -180,6 +185,8 @@ def _compare_cp_with_tp(
         str(pp_size),
         "--decode-context-parallel-size",
         str(dcp_size),
+        "--dcp-kv-cache-interleave-size",
+        str(dcp_kv_cache_interleave_size),
         "--distributed-executor-backend",
         distributed_backend,
     ]
@@ -207,6 +214,7 @@ CP_TEXT_GENERATION_MODELS = {
     "deepseek-ai/DeepSeek-V2-Lite-Chat": [
         CPTestSettings.detailed(),
         CPTestSettings.detailed(tp_base=2),
+        CPTestSettings.detailed(tp_base=2, dcp_kv_cache_interleave_size=64),
     ],
     "bigcode/gpt_bigcode-santacoder": [
         CPTestSettings.detailed(),
@@ -247,6 +255,17 @@ def test_cp_generation(
     test_options: CPTestOptions,
     num_gpus_available,
 ):
+    if (
+        model_id == "deepseek-ai/DeepSeek-V2-Lite-Chat"
+        and torch.cuda.get_device_capability() < (9, 0)
+    ):
+        pytest.skip(reason="MLA+DCP requires compute capability of 9.0 or higher")
+    if (
+        model_id == "bigcode/gpt_bigcode-santacoder"
+        and torch.cuda.get_device_capability() != (9, 0)
+    ):
+        pytest.skip(reason="GQA+DCP currently requires compute capability of 9.0")
+
     _compare_cp_with_tp(
         model_id,
         parallel_setup,
diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py
new file mode 100644
index 0000000000000..11e23f128f331
--- /dev/null
+++ b/tests/distributed/test_eplb_spec_decode.py
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import lm_eval
+import pytest
+
+from tests.utils import large_gpu_mark
+
+
+def get_model_args(
+    model_name: str,
+    spec_model_name: str,
+    spec_method: str,
+    tp_size: int,
+    model_max_len: int,
+) -> dict:
+    speculative_config = {
+        "method": spec_method,
+        "model": spec_model_name,
+        "num_speculative_tokens": 1,
+        "max_model_len": model_max_len,
+    }
+
+    model_args = {
+        "pretrained": model_name,
+        "dtype": "auto",
+        "add_bos_token": True,
+        "tensor_parallel_size": tp_size,
+        "gpu_memory_utilization": 0.7,
+        "speculative_config": speculative_config,
+        "enable_expert_parallel": True,
+        "num_redundant_experts": tp_size,
+        "eplb_window_size": 128,
+        "eplb_step_interval": 1024,
+        "eplb_log_balancedness": False,
+        "enable_eplb": True,
+        "max_model_len": model_max_len,
+    }
+    return model_args
+
+
+@pytest.mark.parametrize(
+    "model_setup",
+    [
+        pytest.param(
+            ("mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4, 0.86),
+            marks=large_gpu_mark(min_gb=80),
+        ),
+        pytest.param(
+            (
+                "eagle",
+                "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+                "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
+                4,
+                0.92,
+            ),
+            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues"),
+        ),
+    ],
+    ids=["qwen3_next_mtp", "llama4_eagle"],
+)
+def test_eplb_spec_decode(
+    monkeypatch: pytest.MonkeyPatch,
+    model_setup: tuple[str, str, str, int, float],
+):
+    """
+    Test the correctness of EPLB speculative decoding with GSM8K dataset.
+    Applicable to MoE models with mtp or eagle spec decode.
+    """
+    method, model_name, spec_model_name, tp_size, expected_gsm8k_value = model_setup
+
+    TASK = "gsm8k"
+    FILTER = "exact_match,strict-match"
+    RTOL = 0.03
+
+    model_args = get_model_args(
+        model_name=model_name,
+        spec_model_name=spec_model_name,
+        spec_method=method,
+        tp_size=tp_size,
+        model_max_len=4096,
+    )
+
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=TASK,
+        batch_size=64,
+        num_fewshot=8,
+    )
+    measured_value = results["results"][TASK][FILTER]
+    assert (
+        measured_value - RTOL < expected_gsm8k_value
+        and measured_value + RTOL > expected_gsm8k_value
+    ), f"Expected: {expected_gsm8k_value} |  Measured: {measured_value}"
diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index b2a958a992a62..a9698632b82e0 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -6,6 +6,7 @@ import pytest
 
 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.sampling_params import SamplingParams
 
 from ..openai.test_vision import TEST_IMAGE_ASSETS
 
@@ -23,6 +24,29 @@ def text_llm():
     cleanup_dist_env_and_memory()
 
 
+@pytest.fixture(scope="function")
+def llm_for_failure_test():
+    """
+    Fixture for testing issue #26081.
+    Uses a small max_model_len to easily trigger length errors.
+    """
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        enforce_eager=True,
+        seed=0,
+        max_model_len=128,
+        disable_log_stats=True,
+    )
+
+    yield weakref.proxy(llm)
+
+    del llm
+
+    cleanup_dist_env_and_memory()
+
+
 def test_chat(text_llm):
     prompt1 = "Explain the concept of entropy."
     messages = [
@@ -157,3 +181,32 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking):
     else:
         # The chat template includes dummy thinking process
         assert think_id in prompt_token_ids
+
+
+def test_chat_batch_failure_cleanup(llm_for_failure_test):
+    """
+    Tests that if a batch call to llm.chat() fails mid-way
+    (e.g., due to one invalid prompt), the requests that
+    were already enqueued are properly aborted and do not
+    pollute the queue for subsequent calls.
+    (Fixes Issue #26081)
+    """
+    llm = llm_for_failure_test
+    valid_msg = [{"role": "user", "content": "Hello"}]
+    long_text = "This is a very long text to test the error " * 50
+    invalid_msg = [{"role": "user", "content": long_text}]
+    batch_1 = [
+        valid_msg,
+        valid_msg,
+        invalid_msg,
+    ]
+    batch_2 = [
+        valid_msg,
+        valid_msg,
+    ]
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    with pytest.raises(ValueError, match="longer than the maximum model length"):
+        llm.chat(batch_1, sampling_params=sampling_params)
+    outputs_2 = llm.chat(batch_2, sampling_params=sampling_params)
+    assert len(outputs_2) == len(batch_2)
+    assert llm.llm_engine.get_num_unfinished_requests() == 0
diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
index e452b578ba22b..7b3092b563030 100644
--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@@ -80,7 +80,7 @@ FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
 
 
 def extract_reasoning_and_calls(chunks: list):
-    reasoning_content = ""
+    reasoning = ""
     tool_call_idx = -1
     arguments = []
     function_names = []
@@ -99,9 +99,9 @@ def extract_reasoning_and_calls(chunks: list):
                 if tool_call.function.arguments:
                     arguments[tool_call_idx] += tool_call.function.arguments
         else:
-            if hasattr(chunk.choices[0].delta, "reasoning_content"):
-                reasoning_content += chunk.choices[0].delta.reasoning_content
-    return reasoning_content, arguments, function_names
+            if hasattr(chunk.choices[0].delta, "reasoning"):
+                reasoning += chunk.choices[0].delta.reasoning
+    return reasoning, arguments, function_names
 
 
 # test streaming
@@ -119,8 +119,8 @@ async def test_chat_streaming_of_tool_and_reasoning(client: openai.AsyncOpenAI):
     async for chunk in stream:
         chunks.append(chunk)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
-    assert len(reasoning_content) > 0
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
+    assert len(reasoning) > 0
     assert len(function_names) > 0 and function_names[0] == FUNC_NAME
     assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
 
@@ -136,6 +136,6 @@ async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
         stream=False,
     )
 
-    assert len(tool_calls.choices[0].message.reasoning_content) > 0
+    assert len(tool_calls.choices[0].message.reasoning) > 0
     assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
     assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS
diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index 6d8db361a57d4..53369f074eca8 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -180,8 +180,8 @@ async def test_function_tool_use(
             extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
         )
         if enable_thinking:
-            assert chat_completion.choices[0].message.reasoning_content is not None
-            assert chat_completion.choices[0].message.reasoning_content != ""
+            assert chat_completion.choices[0].message.reasoning is not None
+            assert chat_completion.choices[0].message.reasoning != ""
         assert chat_completion.choices[0].message.tool_calls is not None
         assert len(chat_completion.choices[0].message.tool_calls) > 0
     else:
@@ -200,9 +200,9 @@ async def test_function_tool_use(
         async for chunk in output_stream:
             if chunk.choices:
                 if enable_thinking and getattr(
-                    chunk.choices[0].delta, "reasoning_content", None
+                    chunk.choices[0].delta, "reasoning", None
                 ):
-                    reasoning.append(chunk.choices[0].delta.reasoning_content)
+                    reasoning.append(chunk.choices[0].delta.reasoning)
                 if chunk.choices[0].delta.tool_calls:
                     output.extend(chunk.choices[0].delta.tool_calls)
 
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index a85418d5b5f4e..b05fa379c69fc 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -40,6 +40,7 @@ class MockModelConfig:
     tokenizer_revision: str | None = None
     multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
     hf_config: MockHFConfig = field(default_factory=MockHFConfig)
+    logits_processors: list[str] | None = None
     logits_processor_pattern: str | None = None
     diff_sampling_param: dict | None = None
     allowed_local_media_path: str = ""
diff --git a/tests/entrypoints/anthropic/test_messages.py b/tests/entrypoints/openai/test_messages.py
similarity index 68%
rename from tests/entrypoints/anthropic/test_messages.py
rename to tests/entrypoints/openai/test_messages.py
index 4e35554b4e330..3e390ad496428 100644
--- a/tests/entrypoints/anthropic/test_messages.py
+++ b/tests/entrypoints/openai/test_messages.py
@@ -5,7 +5,7 @@ import anthropic
 import pytest
 import pytest_asyncio
 
-from ...utils import RemoteAnthropicServer
+from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "Qwen/Qwen3-0.6B"
 
@@ -23,13 +23,13 @@ def server():  # noqa: F811
         "claude-3-7-sonnet-latest",
     ]
 
-    with RemoteAnthropicServer(MODEL_NAME, args) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
 
 @pytest_asyncio.fixture
 async def client(server):
-    async with server.get_async_client() as async_client:
+    async with server.get_async_client_anthropic() as async_client:
         yield async_client
 
 
@@ -105,37 +105,37 @@ async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
 
     print(f"Anthropic response: {resp.model_dump_json()}")
 
-    @pytest.mark.asyncio
-    async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
-        resp = await client.messages.create(
-            model="claude-3-7-sonnet-latest",
-            max_tokens=1024,
-            messages=[
-                {
-                    "role": "user",
-                    "content": "What's the weather like in New York today?",
-                }
-            ],
-            tools=[
-                {
-                    "name": "get_current_weather",
-                    "description": "Useful for querying the weather "
-                    "in a specified city.",
-                    "input_schema": {
-                        "type": "object",
-                        "properties": {
-                            "location": {
-                                "type": "string",
-                                "description": "City or region, for example: "
-                                "New York, London, Tokyo, etc.",
-                            }
-                        },
-                        "required": ["location"],
-                    },
-                }
-            ],
-            stream=True,
-        )
 
-        async for chunk in resp:
-            print(chunk.model_dump_json())
+@pytest.mark.asyncio
+async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
+    resp = await client.messages.create(
+        model="claude-3-7-sonnet-latest",
+        max_tokens=1024,
+        messages=[
+            {
+                "role": "user",
+                "content": "What's the weather like in New York today?",
+            }
+        ],
+        tools=[
+            {
+                "name": "get_current_weather",
+                "description": "Useful for querying the weather in a specified city.",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "City or region, for example: "
+                            "New York, London, Tokyo, etc.",
+                        }
+                    },
+                    "required": ["location"],
+                },
+            }
+        ],
+        stream=True,
+    )
+
+    async for chunk in resp:
+        print(chunk.model_dump_json())
diff --git a/tests/entrypoints/openai/test_orca_metrics.py b/tests/entrypoints/openai/test_orca_metrics.py
new file mode 100644
index 0000000000000..d32cfde07c21e
--- /dev/null
+++ b/tests/entrypoints/openai/test_orca_metrics.py
@@ -0,0 +1,128 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import openai
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+
+
+@pytest.fixture(scope="module")
+def monkeypatch_module():
+    from _pytest.monkeypatch import MonkeyPatch
+
+    mpatch = MonkeyPatch()
+    yield mpatch
+    mpatch.undo()
+
+
+@pytest.fixture(scope="module", params=[True])
+def server(request, monkeypatch_module):
+    use_v1 = request.param
+    monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_chat_completion_with_orca_header(server: RemoteOpenAIServer):
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "what is 1+1?"},
+    ]
+
+    client = openai.OpenAI(
+        api_key="EMPTY",
+        base_url=f"http://localhost:{server.port}/v1",
+        default_headers={"endpoint-load-metrics-format": "TEXT"},
+    )
+
+    # 1. Use raw client to get response headers.
+    raw_client = client.with_raw_response
+
+    # 2. Make the API call using the raw_client
+    response_with_raw = raw_client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        extra_headers={"endpoint-load-metrics-format": "TEXT"},
+    )
+
+    # 3. Access the raw httpx.Response object
+    raw_http_response = response_with_raw.http_response
+
+    # 4. Get the headers from the httpx.Response object
+    response_headers = raw_http_response.headers
+
+    assert "endpoint-load-metrics" in response_headers
+
+
+@pytest.mark.asyncio
+async def test_completion_with_orca_header(client: openai.AsyncOpenAI):
+    # 1. Use raw client to get response headers.
+    raw_client = client.with_raw_response
+
+    # 2. Make the API call using the raw_client
+    completion = await raw_client.completions.create(
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        extra_headers={"endpoint-load-metrics-format": "JSON"},
+    )
+
+    # 3. Access the raw httpx.Response object
+    raw_http_response = completion.http_response
+
+    # 4. Get the headers from the httpx.Response object
+    response_headers = raw_http_response.headers
+
+    assert "endpoint-load-metrics" in response_headers
+
+
+@pytest.mark.asyncio
+async def test_single_completion(client: openai.AsyncOpenAI):
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        extra_headers={"endpoint-load-metrics-format": "JSON"},
+        temperature=0.0,
+    )
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+
+    choice = completion.choices[0]
+    assert len(choice.text) >= 5
+    assert choice.finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11
+    )
+
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert len(completion.choices[0].text) >= 1
+    assert completion.choices[0].prompt_logprobs is None
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 2f678a0535cc6..f951b57fe7269 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -232,9 +232,9 @@ def test_reasoning_parser():
             assert isinstance(line_dict, dict)
             assert line_dict["error"] is None
 
-            # Check that reasoning_content is present and not empty
-            reasoning_content = line_dict["response"]["body"]["choices"][0]["message"][
-                "reasoning_content"
+            # Check that reasoning is present and not empty
+            reasoning = line_dict["response"]["body"]["choices"][0]["message"][
+                "reasoning"
             ]
-            assert reasoning_content is not None
-            assert len(reasoning_content) > 0
+            assert reasoning is not None
+            assert len(reasoning) > 0
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 1b83ed7e31e78..dd10384a7e8c0 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -353,6 +353,7 @@ class MockModelConfig:
     tokenizer_revision = None
     multimodal_config = MultiModalConfig()
     hf_config = MockHFConfig()
+    logits_processors: list[str] | None = None
     logits_processor_pattern = None
     diff_sampling_param: dict | None = None
     allowed_local_media_path: str = ""
diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index 6ef932392d095..88580ed899f1a 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -65,6 +65,41 @@ async def test_basic_audio(mary_had_lamb, model_name):
         assert out_usage["seconds"] == 16, out_usage["seconds"]
 
 
+@pytest.mark.asyncio
+async def test_basic_audio_with_lora(mary_had_lamb):
+    """Ensure STT (transcribe) requests can pass LoRA through to generate."""
+    model_name = "ibm-granite/granite-speech-3.3-2b"
+    lora_model_name = "speech"
+    server_args = [
+        "--enforce-eager",
+        "--enable-lora",
+        "--max-lora-rank",
+        "64",
+        "--lora-modules",
+        f"{lora_model_name}={model_name}",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "1",
+    ]
+
+    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        transcription = await client.audio.transcriptions.create(
+            model=lora_model_name,
+            file=mary_had_lamb,
+            language="en",
+            response_format="text",
+            temperature=0.0,
+        )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    out_usage = out["usage"]
+    assert "mary had a little lamb" in out_text
+    assert out_usage["seconds"] == 16, out_usage["seconds"]
+
+
 @pytest.mark.asyncio
 async def test_basic_audio_gemma(foscolo):
     # Gemma accuracy on some of the audio samples we use is particularly bad,
@@ -72,7 +107,9 @@ async def test_basic_audio_gemma(foscolo):
     model_name = "google/gemma-3n-E2B-it"
     server_args = ["--enforce-eager"]
 
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+    with RemoteOpenAIServer(
+        model_name, server_args, max_wait_seconds=480
+    ) as remote_server:
         client = remote_server.get_async_client()
         transcription = await client.audio.transcriptions.create(
             model=model_name,
diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py
index f35742e166fe0..d7d407484f16d 100644
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -48,6 +48,40 @@ async def test_non_asr_model(foscolo):
         assert err["message"] == "The model does not support Translations API"
 
 
+@pytest.mark.asyncio
+async def test_basic_audio_with_lora(mary_had_lamb):
+    """Ensure STT (translate) requests can pass LoRA through to generate."""
+    # NOTE - careful to call this test before the module scoped server
+    # fixture, otherwise it'll OOMkill the CI
+    model_name = "ibm-granite/granite-speech-3.3-2b"
+    lora_model_name = "speech"
+    server_args = [
+        "--enforce-eager",
+        "--enable-lora",
+        "--max-lora-rank",
+        "64",
+        "--lora-modules",
+        f"{lora_model_name}={model_name}",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "1",
+    ]
+
+    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        translation = await client.audio.translations.create(
+            model=lora_model_name,
+            file=mary_had_lamb,
+            extra_body=dict(language="en", to_language="es"),
+            response_format="text",
+            temperature=0.0,
+        )
+    out = json.loads(translation)["text"].strip().lower()
+    assert "pequeño" in out.split(" ")
+
+
 # NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
 @pytest.mark.asyncio
 async def test_basic_audio(foscolo, client_and_model):
diff --git a/tests/entrypoints/anthropic/__init__.py b/tests/entrypoints/sagemaker/__init__.py
similarity index 100%
rename from tests/entrypoints/anthropic/__init__.py
rename to tests/entrypoints/sagemaker/__init__.py
diff --git a/tests/entrypoints/sagemaker/conftest.py b/tests/entrypoints/sagemaker/conftest.py
new file mode 100644
index 0000000000000..4c859c2527d25
--- /dev/null
+++ b/tests/entrypoints/sagemaker/conftest.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Shared fixtures and utilities for SageMaker tests."""
+
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+# Model name constants used across tests
+MODEL_NAME_ZEPHYR = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct"
+LORA_ADAPTER_NAME_SMOLLM = "jekunz/smollm-135m-lora-fineweb-faroese"
+
+# SageMaker header constants
+HEADER_SAGEMAKER_CLOSED_SESSION_ID = "X-Amzn-SageMaker-Closed-Session-Id"
+HEADER_SAGEMAKER_SESSION_ID = "X-Amzn-SageMaker-Session-Id"
+HEADER_SAGEMAKER_NEW_SESSION_ID = "X-Amzn-SageMaker-New-Session-Id"
+
+
+@pytest.fixture(scope="session")
+def smollm2_lora_files():
+    """Download LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(repo_id=LORA_ADAPTER_NAME_SMOLLM)
+
+
+@pytest.fixture(scope="module")
+def basic_server_with_lora(smollm2_lora_files):
+    """Basic server fixture with standard configuration."""
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--enable-lora",
+        "--max-lora-rank",
+        "256",
+        "--max-cpu-loras",
+        "2",
+        "--max-num-seqs",
+        "64",
+    ]
+
+    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
+    with RemoteOpenAIServer(MODEL_NAME_SMOLLM, args, env_dict=envs) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def async_client(basic_server_with_lora: RemoteOpenAIServer):
+    """Async OpenAI client fixture for use with basic_server."""
+    async with basic_server_with_lora.get_async_client() as async_client:
+        yield async_client
diff --git a/tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py b/tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py
new file mode 100644
index 0000000000000..0d4f8e885824a
--- /dev/null
+++ b/tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py
@@ -0,0 +1,734 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Integration tests for handler override functionality.
+
+Tests real customer usage scenarios:
+- Using @custom_ping_handler and @custom_invocation_handler decorators
+  to override handlers
+- Setting environment variables for handler specifications
+- Writing customer scripts with custom_sagemaker_ping_handler() and
+  custom_sagemaker_invocation_handler() functions
+- Priority: env vars > decorators > customer script files > framework
+  defaults
+
+Note: These tests focus on validating server responses rather than directly calling
+get_ping_handler() and get_invoke_handler() to ensure full integration testing.
+"""
+
+import os
+import tempfile
+
+import pytest
+import requests
+
+from ...utils import RemoteOpenAIServer
+from .conftest import (
+    MODEL_NAME_SMOLLM,
+)
+
+
+class TestHandlerOverrideIntegration:
+    """Integration tests simulating real customer usage scenarios.
+
+    Each test simulates a fresh server startup where customers:
+    - Use @custom_ping_handler and @custom_invocation_handler decorators
+    - Set environment variables (CUSTOM_FASTAPI_PING_HANDLER, etc.)
+    - Write customer scripts with custom_sagemaker_ping_handler() and
+      custom_sagemaker_invocation_handler() functions
+    """
+
+    def setup_method(self):
+        """Setup for each test - simulate fresh server startup."""
+        self._clear_caches()
+        self._clear_env_vars()
+
+    def teardown_method(self):
+        """Cleanup after each test."""
+        self._clear_env_vars()
+
+    def _clear_caches(self):
+        """Clear handler registry and function loader cache."""
+        try:
+            from model_hosting_container_standards.common.handler import (
+                handler_registry,
+            )
+            from model_hosting_container_standards.sagemaker.sagemaker_loader import (
+                SageMakerFunctionLoader,
+            )
+
+            handler_registry.clear()
+            SageMakerFunctionLoader._default_function_loader = None
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+    def _clear_env_vars(self):
+        """Clear SageMaker environment variables."""
+        try:
+            from model_hosting_container_standards.common.fastapi.config import (
+                FastAPIEnvVars,
+            )
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+
+            # Clear SageMaker env vars
+            for var in [
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME,
+            ]:
+                os.environ.pop(var, None)
+
+            # Clear FastAPI env vars
+            for var in [
+                FastAPIEnvVars.CUSTOM_FASTAPI_PING_HANDLER,
+                FastAPIEnvVars.CUSTOM_FASTAPI_INVOCATION_HANDLER,
+            ]:
+                os.environ.pop(var, None)
+        except ImportError:
+            pass
+
+    @pytest.mark.asyncio
+    async def test_customer_script_functions_auto_loaded(self):
+        """Test customer scenario: script functions automatically override
+        framework defaults."""
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a script file with ping() and invoke() functions
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from fastapi import Request
+
+async def custom_sagemaker_ping_handler():
+    return {
+        "status": "healthy",
+        "source": "customer_override", 
+        "message": "Custom ping from customer script"
+    }
+
+async def custom_sagemaker_invocation_handler(request: Request):
+    return {
+        "predictions": ["Custom response from customer script"],
+        "source": "customer_override"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Customer sets SageMaker environment variables to point to their script
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                # Customer tests their server and sees their overrides work
+                # automatically
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # Customer sees their functions are used
+                assert ping_data["source"] == "customer_override"
+                assert ping_data["message"] == "Custom ping from customer script"
+                assert invoke_data["source"] == "customer_override"
+                assert invoke_data["predictions"] == [
+                    "Custom response from customer script"
+                ]
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_customer_decorator_usage(self):
+        """Test customer scenario: using @custom_ping_handler and
+        @custom_invocation_handler decorators."""
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a script file with decorators
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+import model_hosting_container_standards.sagemaker as sagemaker_standards
+from fastapi import Request
+
+@sagemaker_standards.custom_ping_handler
+async def my_ping():
+    return {
+        "type": "ping",
+        "source": "customer_decorator"
+    }
+
+@sagemaker_standards.custom_invocation_handler  
+async def my_invoke(request: Request):
+    return {
+        "type": "invoke", 
+        "source": "customer_decorator"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # Customer sees their handlers are used by the server
+                assert ping_data["source"] == "customer_decorator"
+                assert invoke_data["source"] == "customer_decorator"
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_handler_priority_order(self):
+        """Test priority: @custom_ping_handler/@custom_invocation_handler
+        decorators vs script functions."""
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a script with both decorator and regular functions
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+import model_hosting_container_standards.sagemaker as sagemaker_standards
+from fastapi import Request
+
+# Customer uses @custom_ping_handler decorator (higher priority than script functions)
+@sagemaker_standards.custom_ping_handler
+async def decorated_ping():
+    return {
+        "status": "healthy",
+        "source": "ping_decorator_in_script", 
+        "priority": "decorator"
+    }
+
+# Customer also has a regular function (lower priority than
+# @custom_ping_handler decorator)
+async def custom_sagemaker_ping_handler():
+    return {
+        "status": "healthy",
+        "source": "script_function",
+        "priority": "function"
+    }
+
+# Customer has a regular invoke function
+async def custom_sagemaker_invocation_handler(request: Request):
+    return {
+        "predictions": ["Script function response"],
+        "source": "script_invoke_function",
+        "priority": "function"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # @custom_ping_handler decorator has higher priority than
+                # script function
+                assert ping_data["source"] == "ping_decorator_in_script"
+                assert ping_data["priority"] == "decorator"
+
+                # Script function is used for invoke
+                assert invoke_data["source"] == "script_invoke_function"
+                assert invoke_data["priority"] == "function"
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_environment_variable_script_loading(self):
+        """Test that environment variables correctly specify script location
+        and loading."""
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a script in a specific directory
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from fastapi import Request
+
+async def custom_sagemaker_ping_handler():
+    return {
+        "status": "healthy",
+        "source": "env_loaded_script",
+        "method": "environment_variable_loading"
+    }
+
+async def custom_sagemaker_invocation_handler(request: Request):
+    return {
+        "predictions": ["Loaded via environment variables"],
+        "source": "env_loaded_script",
+        "method": "environment_variable_loading"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Test environment variable script loading
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # Verify that the script was loaded via environment variables
+                assert ping_data["source"] == "env_loaded_script"
+                assert ping_data["method"] == "environment_variable_loading"
+                assert invoke_data["source"] == "env_loaded_script"
+                assert invoke_data["method"] == "environment_variable_loading"
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_framework_default_handlers(self):
+        """Test that framework default handlers work when no customer
+        overrides exist."""
+        args = [
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "2048",
+            "--enforce-eager",
+            "--max-num-seqs",
+            "32",
+        ]
+
+        # Explicitly pass empty env_dict to ensure no SageMaker env vars are set
+        # This prevents pollution from previous tests
+        try:
+            from model_hosting_container_standards.common.fastapi.config import (
+                FastAPIEnvVars,
+            )
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+
+            env_dict = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: "",
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: "",
+                FastAPIEnvVars.CUSTOM_FASTAPI_PING_HANDLER: "",
+                FastAPIEnvVars.CUSTOM_FASTAPI_INVOCATION_HANDLER: "",
+            }
+        except ImportError:
+            env_dict = {}
+
+        with RemoteOpenAIServer(MODEL_NAME_SMOLLM, args, env_dict=env_dict) as server:
+            # Test that default ping works
+            ping_response = requests.get(server.url_for("ping"))
+            assert ping_response.status_code == 200
+
+            # Test that default invocations work
+            invoke_response = requests.post(
+                server.url_for("invocations"),
+                json={
+                    "model": MODEL_NAME_SMOLLM,
+                    "messages": [{"role": "user", "content": "Hello"}],
+                    "max_tokens": 5,
+                },
+            )
+            assert invoke_response.status_code == 200
+
+    @pytest.mark.asyncio
+    async def test_handler_env_var_override(self):
+        """Test CUSTOM_FASTAPI_PING_HANDLER and CUSTOM_FASTAPI_INVOCATION_HANDLER
+        environment variable overrides."""
+        try:
+            from model_hosting_container_standards.common.fastapi.config import (
+                FastAPIEnvVars,
+            )
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Create a script with both env var handlers and script functions
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from fastapi import Request, Response
+import json
+
+async def env_var_ping_handler(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "status": "healthy",
+            "source": "env_var_ping",
+            "method": "environment_variable"
+        }),
+        media_type="application/json"
+    )
+
+async def env_var_invoke_handler(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "predictions": ["Environment variable response"],
+            "source": "env_var_invoke",
+            "method": "environment_variable"
+        }),
+        media_type="application/json"
+    )
+
+async def custom_sagemaker_ping_handler():
+    return {
+        "status": "healthy",
+        "source": "script_ping",
+        "method": "script_function"
+    }
+
+async def custom_sagemaker_invocation_handler(request: Request):
+    return {
+        "predictions": ["Script function response"],
+        "source": "script_invoke",
+        "method": "script_function"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Set environment variables to override both handlers
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+                FastAPIEnvVars.CUSTOM_FASTAPI_PING_HANDLER: (
+                    f"{script_name}:env_var_ping_handler"
+                ),
+                FastAPIEnvVars.CUSTOM_FASTAPI_INVOCATION_HANDLER: (
+                    f"{script_name}:env_var_invoke_handler"
+                ),
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                # Test ping handler override
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                # Environment variable should override script function
+                assert ping_data["method"] == "environment_variable"
+                assert ping_data["source"] == "env_var_ping"
+
+                # Test invocation handler override
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # Environment variable should override script function
+                assert invoke_data["method"] == "environment_variable"
+                assert invoke_data["source"] == "env_var_invoke"
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_env_var_priority_over_decorator_and_script(self):
+        """Test that environment variables have highest priority over decorators
+        and script functions for both ping and invocation handlers."""
+        try:
+            from model_hosting_container_standards.common.fastapi.config import (
+                FastAPIEnvVars,
+            )
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Create a script with all three handler types for both ping and invocation
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+import model_hosting_container_standards.sagemaker as sagemaker_standards
+from fastapi import Request, Response
+import json
+
+# Environment variable handlers (highest priority)
+async def env_priority_ping(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "status": "healthy",
+            "source": "env_var",
+            "priority": "environment_variable"
+        }),
+        media_type="application/json"
+    )
+
+async def env_priority_invoke(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "predictions": ["Environment variable response"],
+            "source": "env_var",
+            "priority": "environment_variable"
+        }),
+        media_type="application/json"
+    )
+
+# Decorator handlers (medium priority)
+@sagemaker_standards.custom_ping_handler
+async def decorator_ping(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "status": "healthy",
+            "source": "decorator",
+            "priority": "decorator"
+        }),
+        media_type="application/json"
+    )
+
+@sagemaker_standards.custom_invocation_handler
+async def decorator_invoke(raw_request: Request) -> Response:
+    return Response(
+        content=json.dumps({
+            "predictions": ["Decorator response"],
+            "source": "decorator",
+            "priority": "decorator"
+        }),
+        media_type="application/json"
+    )
+
+# Script functions (lowest priority)
+async def custom_sagemaker_ping_handler():
+    return {
+        "status": "healthy",
+        "source": "script",
+        "priority": "script_function"
+    }
+
+async def custom_sagemaker_invocation_handler(request: Request):
+    return {
+        "predictions": ["Script function response"],
+        "source": "script",
+        "priority": "script_function"
+    }
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Set environment variables to specify highest priority handlers
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+                FastAPIEnvVars.CUSTOM_FASTAPI_PING_HANDLER: (
+                    f"{script_name}:env_priority_ping"
+                ),
+                FastAPIEnvVars.CUSTOM_FASTAPI_INVOCATION_HANDLER: (
+                    f"{script_name}:env_priority_invoke"
+                ),
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                # Test ping handler priority
+                ping_response = requests.get(server.url_for("ping"))
+                assert ping_response.status_code == 200
+                ping_data = ping_response.json()
+
+                # Environment variable has highest priority and should be used
+                assert ping_data["priority"] == "environment_variable"
+                assert ping_data["source"] == "env_var"
+
+                # Test invocation handler priority
+                invoke_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                    },
+                )
+                assert invoke_response.status_code == 200
+                invoke_data = invoke_response.json()
+
+                # Environment variable has highest priority and should be used
+                assert invoke_data["priority"] == "environment_variable"
+                assert invoke_data["source"] == "env_var"
+
+        finally:
+            os.unlink(script_path)
diff --git a/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py b/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
new file mode 100644
index 0000000000000..a2867efdc5840
--- /dev/null
+++ b/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import openai  # use the official async_client for correctness check
+import pytest
+import requests
+
+from ...utils import RemoteOpenAIServer
+from .conftest import MODEL_NAME_SMOLLM
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_load_adapter_happy_path(
+    async_client: openai.AsyncOpenAI,
+    basic_server_with_lora: RemoteOpenAIServer,
+    smollm2_lora_files,
+):
+    # The SageMaker standards library creates a POST /adapters endpoint
+    # that maps to the load_lora_adapter handler with request shape:
+    # {"lora_name": "body.name", "lora_path": "body.src"}
+    load_response = requests.post(
+        basic_server_with_lora.url_for("adapters"),
+        json={"name": "smollm2-lora-sagemaker", "src": smollm2_lora_files},
+    )
+    load_response.raise_for_status()
+
+    models = await async_client.models.list()
+    models = models.data
+    dynamic_lora_model = models[-1]
+    assert dynamic_lora_model.root == smollm2_lora_files
+    assert dynamic_lora_model.parent == MODEL_NAME_SMOLLM
+    assert dynamic_lora_model.id == "smollm2-lora-sagemaker"
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_unload_adapter_happy_path(
+    async_client: openai.AsyncOpenAI,
+    basic_server_with_lora: RemoteOpenAIServer,
+    smollm2_lora_files,
+):
+    # First, load an adapter
+    adapter_name = "smollm2-lora-sagemaker-unload"
+    load_response = requests.post(
+        basic_server_with_lora.url_for("adapters"),
+        json={"name": adapter_name, "src": smollm2_lora_files},
+    )
+    load_response.raise_for_status()
+
+    # Verify it's in the models list
+    models = await async_client.models.list()
+    adapter_ids = [model.id for model in models.data]
+    assert adapter_name in adapter_ids
+
+    # Now unload it using DELETE /adapters/{adapter_name}
+    # The SageMaker standards maps this to unload_lora_adapter with:
+    # {"lora_name": "path_params.adapter_name"}
+    unload_response = requests.delete(
+        basic_server_with_lora.url_for("adapters", adapter_name),
+    )
+    unload_response.raise_for_status()
+
+    # Verify it's no longer in the models list
+    models = await async_client.models.list()
+    adapter_ids = [model.id for model in models.data]
+    assert adapter_name not in adapter_ids
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_load_adapter_not_found(
+    basic_server_with_lora: RemoteOpenAIServer,
+):
+    load_response = requests.post(
+        basic_server_with_lora.url_for("adapters"),
+        json={"name": "nonexistent-adapter", "src": "/path/does/not/exist"},
+    )
+    assert load_response.status_code == 404
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_load_adapter_invalid_files(
+    basic_server_with_lora: RemoteOpenAIServer,
+    tmp_path,
+):
+    invalid_files = tmp_path / "invalid_adapter"
+    invalid_files.mkdir()
+    (invalid_files / "adapter_config.json").write_text("not valid json")
+
+    load_response = requests.post(
+        basic_server_with_lora.url_for("adapters"),
+        json={"name": "invalid-adapter", "src": str(invalid_files)},
+    )
+    assert load_response.status_code == 400
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_unload_nonexistent_adapter(
+    basic_server_with_lora: RemoteOpenAIServer,
+):
+    # Attempt to unload an adapter that doesn't exist
+    unload_response = requests.delete(
+        basic_server_with_lora.url_for("adapters", "nonexistent-adapter-name"),
+    )
+    assert unload_response.status_code in (400, 404)
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_invocations_with_adapter(
+    basic_server_with_lora: RemoteOpenAIServer,
+    smollm2_lora_files,
+):
+    # First, load an adapter via SageMaker endpoint
+    adapter_name = "smollm2-lora-invoke-test"
+    load_response = requests.post(
+        basic_server_with_lora.url_for("adapters"),
+        json={"name": adapter_name, "src": smollm2_lora_files},
+    )
+    load_response.raise_for_status()
+
+    # Now test the /invocations endpoint with the adapter
+    invocation_response = requests.post(
+        basic_server_with_lora.url_for("invocations"),
+        headers={
+            "X-Amzn-SageMaker-Adapter-Identifier": adapter_name,
+        },
+        json={
+            "prompt": "Hello, how are you?",
+            "max_tokens": 10,
+        },
+    )
+    invocation_response.raise_for_status()
+    invocation_output = invocation_response.json()
+
+    # Verify we got a valid completion response
+    assert "choices" in invocation_output
+    assert len(invocation_output["choices"]) > 0
+    assert "text" in invocation_output["choices"][0]
+
+
+@pytest.mark.asyncio
+async def test_sagemaker_multiple_adapters_load_unload(
+    async_client: openai.AsyncOpenAI,
+    basic_server_with_lora: RemoteOpenAIServer,
+    smollm2_lora_files,
+):
+    adapter_names = [f"sagemaker-adapter-{i}" for i in range(5)]
+
+    # Load all adapters
+    for adapter_name in adapter_names:
+        load_response = requests.post(
+            basic_server_with_lora.url_for("adapters"),
+            json={"name": adapter_name, "src": smollm2_lora_files},
+        )
+        load_response.raise_for_status()
+
+    # Verify all are in the models list
+    models = await async_client.models.list()
+    adapter_ids = [model.id for model in models.data]
+    for adapter_name in adapter_names:
+        assert adapter_name in adapter_ids
+
+    # Unload all adapters
+    for adapter_name in adapter_names:
+        unload_response = requests.delete(
+            basic_server_with_lora.url_for("adapters", adapter_name),
+        )
+        unload_response.raise_for_status()
+
+    # Verify all are removed from models list
+    models = await async_client.models.list()
+    adapter_ids = [model.id for model in models.data]
+    for adapter_name in adapter_names:
+        assert adapter_name not in adapter_ids
diff --git a/tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py b/tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py
new file mode 100644
index 0000000000000..f1ed0c7e28973
--- /dev/null
+++ b/tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py
@@ -0,0 +1,346 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Integration test for middleware loader functionality.
+
+Tests that customer middlewares get called correctly with a vLLM server.
+"""
+
+import os
+import tempfile
+
+import pytest
+import requests
+
+from ...utils import RemoteOpenAIServer
+from .conftest import (
+    MODEL_NAME_SMOLLM,
+)
+
+
+class TestMiddlewareIntegration:
+    """Integration test for middleware with vLLM server."""
+
+    def setup_method(self):
+        """Setup for each test - simulate fresh server startup."""
+        self._clear_caches()
+
+    def _clear_caches(self):
+        """Clear middleware registry and function loader cache."""
+        try:
+            from model_hosting_container_standards.common.fastapi.middleware import (
+                middleware_registry,
+            )
+            from model_hosting_container_standards.common.fastapi.middleware.source.decorator_loader import (  # noqa: E501
+                decorator_loader,
+            )
+            from model_hosting_container_standards.sagemaker.sagemaker_loader import (
+                SageMakerFunctionLoader,
+            )
+
+            middleware_registry.clear_middlewares()
+            decorator_loader.clear()
+            SageMakerFunctionLoader._default_function_loader = None
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+    @pytest.mark.asyncio
+    async def test_customer_middleware_with_vllm_server(self):
+        """Test that customer middlewares work with actual vLLM server.
+
+        Tests decorator-based middlewares (@custom_middleware, @input_formatter,
+        @output_formatter)
+        on multiple endpoints (chat/completions, invocations).
+        """
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a middleware script with multiple decorators
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from model_hosting_container_standards.common.fastapi.middleware import (
+    custom_middleware, input_formatter, output_formatter
+)
+
+# Global flag to track if input formatter was called
+_input_formatter_called = False
+
+@input_formatter
+async def customer_input_formatter(request):
+    # Process input - mark that input formatter was called
+    global _input_formatter_called
+    _input_formatter_called = True
+    return request
+
+@custom_middleware("throttle")
+async def customer_throttle_middleware(request, call_next):
+    response = await call_next(request)
+    response.headers["X-Customer-Throttle"] = "applied"
+    order = response.headers.get("X-Middleware-Order", "")
+    response.headers["X-Middleware-Order"] = order + "throttle,"
+    return response
+
+@output_formatter
+async def customer_output_formatter(response):
+    global _input_formatter_called
+    response.headers["X-Customer-Processed"] = "true"
+    # Since input_formatter and output_formatter are combined into
+    # pre_post_process middleware,
+    # if output_formatter is called, input_formatter should have been called too
+    if _input_formatter_called:
+        response.headers["X-Input-Formatter-Called"] = "true"
+    order = response.headers.get("X-Middleware-Order", "")
+    response.headers["X-Middleware-Order"] = order + "output_formatter,"
+    return response
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Set environment variables to point to customer script
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                # Test 1: Middlewares applied to chat/completions endpoint
+                chat_response = requests.post(
+                    server.url_for("v1/chat/completions"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                        "temperature": 0.0,
+                    },
+                )
+
+                assert chat_response.status_code == 200
+
+                # Verify all middlewares were executed
+                assert "X-Customer-Throttle" in chat_response.headers
+                assert chat_response.headers["X-Customer-Throttle"] == "applied"
+                assert "X-Customer-Processed" in chat_response.headers
+                assert chat_response.headers["X-Customer-Processed"] == "true"
+
+                # Verify input formatter was called
+                assert "X-Input-Formatter-Called" in chat_response.headers
+                assert chat_response.headers["X-Input-Formatter-Called"] == "true"
+
+                # Verify middleware execution order
+                execution_order = chat_response.headers.get(
+                    "X-Middleware-Order", ""
+                ).rstrip(",")
+                order_parts = execution_order.split(",") if execution_order else []
+                assert "throttle" in order_parts
+                assert "output_formatter" in order_parts
+
+                # Test 2: Middlewares applied to invocations endpoint
+                invocations_response = requests.post(
+                    server.url_for("invocations"),
+                    json={
+                        "model": MODEL_NAME_SMOLLM,
+                        "messages": [{"role": "user", "content": "Hello"}],
+                        "max_tokens": 5,
+                        "temperature": 0.0,
+                    },
+                )
+
+                assert invocations_response.status_code == 200
+
+                # Verify all middlewares were executed
+                assert "X-Customer-Throttle" in invocations_response.headers
+                assert invocations_response.headers["X-Customer-Throttle"] == "applied"
+                assert "X-Customer-Processed" in invocations_response.headers
+                assert invocations_response.headers["X-Customer-Processed"] == "true"
+
+                # Verify input formatter was called
+                assert "X-Input-Formatter-Called" in invocations_response.headers
+                assert (
+                    invocations_response.headers["X-Input-Formatter-Called"] == "true"
+                )
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_middleware_with_ping_endpoint(self):
+        """Test that middlewares work with SageMaker ping endpoint."""
+        try:
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Customer writes a middleware script
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from model_hosting_container_standards.common.fastapi.middleware import (
+    custom_middleware
+)
+
+@custom_middleware("pre_post_process")
+async def ping_tracking_middleware(request, call_next):
+    response = await call_next(request)
+    if request.url.path == "/ping":
+        response.headers["X-Ping-Tracked"] = "true"
+    return response
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                # Test ping endpoint with middleware
+                response = requests.get(server.url_for("ping"))
+
+                assert response.status_code == 200
+                assert "X-Ping-Tracked" in response.headers
+                assert response.headers["X-Ping-Tracked"] == "true"
+
+        finally:
+            os.unlink(script_path)
+
+    @pytest.mark.asyncio
+    async def test_middleware_env_var_override(self):
+        """Test middleware environment variable overrides."""
+        try:
+            from model_hosting_container_standards.common.fastapi.config import (
+                FastAPIEnvVars,
+            )
+            from model_hosting_container_standards.sagemaker.config import (
+                SageMakerEnvVars,
+            )
+        except ImportError:
+            pytest.skip("model-hosting-container-standards not available")
+
+        # Create a script with middleware functions specified via env vars
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(
+                """
+from fastapi import Request
+
+# Global flag to track if pre_process was called
+_pre_process_called = False
+
+async def env_throttle_middleware(request, call_next):
+    response = await call_next(request)
+    response.headers["X-Env-Throttle"] = "applied"
+    return response
+
+async def env_pre_process(request: Request) -> Request:
+    # Mark that pre_process was called
+    global _pre_process_called
+    _pre_process_called = True
+    return request
+
+async def env_post_process(response):
+    global _pre_process_called
+    if hasattr(response, 'headers'):
+        response.headers["X-Env-Post-Process"] = "applied"
+        # Since pre_process and post_process are combined into
+        # pre_post_process middleware,
+        # if post_process is called, pre_process should have been called too
+        if _pre_process_called:
+            response.headers["X-Pre-Process-Called"] = "true"
+    return response
+"""
+            )
+            script_path = f.name
+
+        try:
+            script_dir = os.path.dirname(script_path)
+            script_name = os.path.basename(script_path)
+
+            # Set environment variables for middleware
+            # Use script_name with .py extension as per plugin example
+            env_vars = {
+                SageMakerEnvVars.SAGEMAKER_MODEL_PATH: script_dir,
+                SageMakerEnvVars.CUSTOM_SCRIPT_FILENAME: script_name,
+                FastAPIEnvVars.CUSTOM_FASTAPI_MIDDLEWARE_THROTTLE: (
+                    f"{script_name}:env_throttle_middleware"
+                ),
+                FastAPIEnvVars.CUSTOM_PRE_PROCESS: f"{script_name}:env_pre_process",
+                FastAPIEnvVars.CUSTOM_POST_PROCESS: f"{script_name}:env_post_process",
+            }
+
+            args = [
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+                "--enforce-eager",
+                "--max-num-seqs",
+                "32",
+            ]
+
+            with RemoteOpenAIServer(
+                MODEL_NAME_SMOLLM, args, env_dict=env_vars
+            ) as server:
+                response = requests.get(server.url_for("ping"))
+                assert response.status_code == 200
+
+                # Check if environment variable middleware was applied
+                headers = response.headers
+
+                # Verify that env var middlewares were applied
+                assert "X-Env-Throttle" in headers, (
+                    "Throttle middleware should be applied via env var"
+                )
+                assert headers["X-Env-Throttle"] == "applied"
+
+                assert "X-Env-Post-Process" in headers, (
+                    "Post-process middleware should be applied via env var"
+                )
+                assert headers["X-Env-Post-Process"] == "applied"
+
+                # Verify that pre_process was called
+                assert "X-Pre-Process-Called" in headers, (
+                    "Pre-process should be called via env var"
+                )
+                assert headers["X-Pre-Process-Called"] == "true"
+
+        finally:
+            os.unlink(script_path)
diff --git a/tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py b/tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py
new file mode 100644
index 0000000000000..6206000385bd9
--- /dev/null
+++ b/tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py
@@ -0,0 +1,153 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import openai  # use the official client for correctness check
+import pytest
+import requests
+
+from ...utils import RemoteOpenAIServer
+from .conftest import (
+    HEADER_SAGEMAKER_CLOSED_SESSION_ID,
+    HEADER_SAGEMAKER_NEW_SESSION_ID,
+    HEADER_SAGEMAKER_SESSION_ID,
+    MODEL_NAME_SMOLLM,
+)
+
+CLOSE_BADREQUEST_CASES = [
+    (
+        "nonexistent_session_id",
+        {"session_id": "nonexistent-session-id"},
+        {},
+        "session not found",
+    ),
+    ("malformed_close_request", {}, {"extra-field": "extra-field-data"}, None),
+]
+
+
+@pytest.mark.asyncio
+async def test_create_session_badrequest(basic_server_with_lora: RemoteOpenAIServer):
+    bad_response = requests.post(
+        basic_server_with_lora.url_for("invocations"),
+        json={"requestType": "NEW_SESSION", "extra-field": "extra-field-data"},
+    )
+
+    assert bad_response.status_code == 400
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "test_name,session_id_change,request_body_change,expected_error",
+    CLOSE_BADREQUEST_CASES,
+)
+async def test_close_session_badrequest(
+    basic_server_with_lora: RemoteOpenAIServer,
+    test_name: str,
+    session_id_change: dict[str, str],
+    request_body_change: dict[str, str],
+    expected_error: str | None,
+):
+    # first attempt to create a session
+    url = basic_server_with_lora.url_for("invocations")
+    create_response = requests.post(url, json={"requestType": "NEW_SESSION"})
+    create_response.raise_for_status()
+    valid_session_id, expiration = create_response.headers.get(
+        HEADER_SAGEMAKER_NEW_SESSION_ID, ""
+    ).split(";")
+    assert valid_session_id
+
+    close_request_json = {"requestType": "CLOSE"}
+    if request_body_change:
+        close_request_json.update(request_body_change)
+    bad_session_id = session_id_change.get("session_id")
+    bad_close_response = requests.post(
+        url,
+        headers={HEADER_SAGEMAKER_SESSION_ID: bad_session_id or valid_session_id},
+        json=close_request_json,
+    )
+
+    # clean up created session, should succeed
+    clean_up_response = requests.post(
+        url,
+        headers={HEADER_SAGEMAKER_SESSION_ID: valid_session_id},
+        json={"requestType": "CLOSE"},
+    )
+    clean_up_response.raise_for_status()
+
+    assert bad_close_response.status_code == 400
+    if expected_error:
+        assert expected_error in bad_close_response.json()["error"]["message"]
+
+
+@pytest.mark.asyncio
+async def test_close_session_invalidrequest(
+    basic_server_with_lora: RemoteOpenAIServer, async_client: openai.AsyncOpenAI
+):
+    # first attempt to create a session
+    url = basic_server_with_lora.url_for("invocations")
+    create_response = requests.post(url, json={"requestType": "NEW_SESSION"})
+    create_response.raise_for_status()
+    valid_session_id, expiration = create_response.headers.get(
+        HEADER_SAGEMAKER_NEW_SESSION_ID, ""
+    ).split(";")
+    assert valid_session_id
+
+    close_request_json = {"requestType": "CLOSE"}
+    invalid_close_response = requests.post(
+        url,
+        # no headers to specify session_id
+        json=close_request_json,
+    )
+
+    # clean up created session, should succeed
+    clean_up_response = requests.post(
+        url,
+        headers={HEADER_SAGEMAKER_SESSION_ID: valid_session_id},
+        json={"requestType": "CLOSE"},
+    )
+    clean_up_response.raise_for_status()
+
+    assert invalid_close_response.status_code == 424
+    assert "invalid session_id" in invalid_close_response.json()["error"]["message"]
+
+
+@pytest.mark.asyncio
+async def test_session(basic_server_with_lora: RemoteOpenAIServer):
+    # first attempt to create a session
+    url = basic_server_with_lora.url_for("invocations")
+    create_response = requests.post(url, json={"requestType": "NEW_SESSION"})
+    create_response.raise_for_status()
+    valid_session_id, expiration = create_response.headers.get(
+        HEADER_SAGEMAKER_NEW_SESSION_ID, ""
+    ).split(";")
+    assert valid_session_id
+
+    # test invocation with session id
+
+    request_args = {
+        "model": MODEL_NAME_SMOLLM,
+        "prompt": "what is 1+1?",
+        "max_completion_tokens": 5,
+        "temperature": 0.0,
+        "logprobs": False,
+    }
+
+    invocation_response = requests.post(
+        basic_server_with_lora.url_for("invocations"),
+        headers={HEADER_SAGEMAKER_SESSION_ID: valid_session_id},
+        json=request_args,
+    )
+    invocation_response.raise_for_status()
+
+    # close created session, should succeed
+    close_response = requests.post(
+        url,
+        headers={HEADER_SAGEMAKER_SESSION_ID: valid_session_id},
+        json={"requestType": "CLOSE"},
+    )
+    close_response.raise_for_status()
+
+    assert (
+        close_response.headers.get(HEADER_SAGEMAKER_CLOSED_SESSION_ID)
+        == valid_session_id
+    )
diff --git a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
index c5dbceeeb2b45..9297bf6ddf2d3 100644
--- a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+++ b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 accuracy_threshold: 0.45
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
\ No newline at end of file
+max_model_len: 4096
diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py
index ce3ab8096b45c..b5d67df7bf3db 100644
--- a/tests/evals/gsm8k/test_gsm8k_correctness.py
+++ b/tests/evals/gsm8k/test_gsm8k_correctness.py
@@ -62,9 +62,11 @@ def test_gsm8k_correctness_param(config_filename, tp_size):
         str(tp_size),
     ]
 
+    env_dict = eval_config.get("env", None)
+
     # Launch server and run evaluation
     with RemoteOpenAIServer(
-        eval_config["model_name"], server_args, max_wait_seconds=480
+        eval_config["model_name"], server_args, env_dict=env_dict, max_wait_seconds=480
     ) as remote_server:
         server_url = remote_server.url_for("v1")
 
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index 48a42ce6ffab5..8149ce7672cdc 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -104,13 +104,6 @@ def test_env(
                                 16, torch.float16, None, block_size, use_mla=use_mla
                             )
                         assert f"The selected backend, {name}" in str(exc_info.value)
-                    elif name == "ROCM_AITER_MLA" and block_size != 1:
-                        # ROCM_AITER_MLA only supports block_size == 1
-                        with pytest.raises(ValueError) as exc_info:
-                            get_attn_backend(
-                                16, torch.float16, None, block_size, use_mla=use_mla
-                            )
-                        assert f"The selected backend, {name}" in str(exc_info.value)
                     else:
                         # Valid backend-block_size combination
                         backend = get_attn_backend(
diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py
index 18995545552ea..6e5468969bf25 100644
--- a/tests/kernels/attention/test_flash_attn.py
+++ b/tests/kernels/attention/test_flash_attn.py
@@ -9,7 +9,6 @@ from vllm.platforms import current_platform
 from vllm.vllm_flash_attn import (
     fa_version_unsupported_reason,
     flash_attn_varlen_func,
-    flash_attn_with_kvcache,
     is_fa_version_supported,
 )
 
@@ -83,124 +82,6 @@ def ref_paged_attn(
     return torch.cat(outputs, dim=0)
 
 
-@pytest.mark.parametrize("use_out", [True, False])
-@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("block_size", BLOCK_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
-@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
-@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
-@pytest.mark.parametrize("fa_version", [2, 3])
-@pytest.mark.parametrize("q_dtype", QDTYPES)
-@torch.inference_mode()
-def test_flash_attn_with_paged_kv(
-    use_out: bool,
-    kv_lens: list[int],
-    num_heads: tuple[int, int],
-    head_size: int,
-    dtype: torch.dtype,
-    block_size: int,
-    soft_cap: float | None,
-    num_blocks: int,
-    sliding_window: int | None,
-    fa_version: int,
-    q_dtype: torch.dtype | None,
-) -> None:
-    torch.set_default_device("cuda")
-    if not is_fa_version_supported(fa_version):
-        pytest.skip(
-            f"Flash attention version {fa_version} not supported due "
-            f'to: "{fa_version_unsupported_reason(fa_version)}"'
-        )
-    if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2):
-        pytest.skip(
-            "Flash attention with quantized inputs is only "
-            "supported on version 3 with bfloat16 base type"
-        )
-
-    current_platform.seed_everything(0)
-    num_seqs = len(kv_lens)
-    num_query_heads = num_heads[0]
-    num_kv_heads = num_heads[1]
-    assert num_query_heads % num_kv_heads == 0
-    max_kv_len = max(kv_lens)
-    scale = head_size**-0.5
-    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
-
-    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
-    key_cache = torch.randn(
-        num_blocks, block_size, num_kv_heads, head_size, dtype=dtype
-    )
-    value_cache = torch.randn_like(key_cache)
-    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
-
-    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
-    block_tables = torch.randint(
-        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
-    )
-
-    q = query.unsqueeze(1)
-    out = torch.empty_like(q) if use_out else None
-
-    maybe_quantized_query = q
-    maybe_quantized_key_cache = key_cache
-    maybe_quantized_value_cache = value_cache
-    q_descale = None
-    k_descale = None
-    v_descale = None
-    if q_dtype is not None:
-        # QKV are drawn from N(0, 1): no need for a fp8 scaling factor
-        maybe_quantized_query = q.to(q_dtype)
-        maybe_quantized_key_cache = key_cache.to(q_dtype)
-        maybe_quantized_value_cache = value_cache.to(q_dtype)
-
-        scale_shape = (num_seqs, num_kv_heads)
-        q_descale = torch.ones(scale_shape, dtype=torch.float32)
-        k_descale = torch.ones(scale_shape, dtype=torch.float32)
-        v_descale = torch.ones(scale_shape, dtype=torch.float32)
-
-    output = flash_attn_with_kvcache(
-        q=maybe_quantized_query,
-        k_cache=maybe_quantized_key_cache,
-        v_cache=maybe_quantized_value_cache,
-        out=out,
-        softmax_scale=scale,
-        causal=True,
-        block_table=block_tables,
-        cache_seqlens=kv_lens_tensor,
-        softcap=soft_cap if soft_cap is not None else 0,
-        window_size=window_size,
-        fa_version=fa_version,
-        q_descale=q_descale,
-        k_descale=k_descale,
-        v_descale=v_descale,
-    )
-    output = output if not use_out else out
-    output = output.squeeze(1)
-
-    atol, rtol = 1.5e-2, 1e-2
-    if q_dtype is not None:
-        atol, rtol = 1.5e-1, 1.5e-1
-
-    ref_output = ref_paged_attn(
-        query=query,
-        key_cache=key_cache,
-        value_cache=value_cache,
-        query_lens=[1] * num_seqs,
-        kv_lens=kv_lens,
-        block_tables=block_tables,
-        scale=scale,
-        soft_cap=soft_cap,
-        sliding_window=sliding_window,
-    )
-    (
-        torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol),
-        f"{torch.max(torch.abs(output - ref_output))}",
-    )
-
-
 @pytest.mark.parametrize("use_out", [True, False])
 @pytest.mark.parametrize(
     "seq_lens", [[(1, 1328), (5, 18), (129, 463)], [(1, 523), (1, 37), (1, 2011)]]
diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
index 79981009c9db0..693b849ebc5d7 100644
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -238,9 +238,11 @@ def test_flashinfer_trtllm_decode_with_baseline(
     if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
         rtol, atol = 7e-2, 9e-2
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
-        rtol, atol = 2e-2, 4e-2
+        rtol, atol = 3e-2, 4e-2
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
-        rtol, atol = 1e-2, 2e-2
+        rtol, atol = 2e-2, 2e-2
+    elif kv_quant_dtype == FP8_DTYPE:
+        rtol, atol = 4e-2, 6e-2
     else:
         rtol, atol = 1e-2, 1e-2
 
diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py
index 63b5a37d3c779..b5fc653ca7353 100644
--- a/tests/kernels/core/test_fused_quant_layernorm.py
+++ b/tests/kernels/core/test_fused_quant_layernorm.py
@@ -11,7 +11,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 
 DTYPES = [torch.bfloat16, torch.float]
 QUANT_DTYPES = [torch.int8, torch.float8_e4m3fn]
-VEC_HIDDEN_SIZES = range(1024, 1030)
+VEC_HIDDEN_SIZES = [1024, 1025, 1027, 1029]
 # Avoid combinatorial explosion with full Cartesian product
 NUM_TOKENS_HIDDEN_SIZES = [
     *[(1, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5120, 5137]],
@@ -65,7 +65,7 @@ def ref_dynamic_per_token_quant(
         )
     else:
         assert quant_dtype == torch.int8
-        torch_out, scales = ops.scaled_int8_quant(torch_out)
+        torch_out, scales, _ = ops.scaled_int8_quant(torch_out)
 
     return torch_out, scales, residual
 
@@ -109,7 +109,7 @@ def ops_impl(
 
 @pytest.mark.parametrize("num_tokens, hidden_size", NUM_TOKENS_HIDDEN_SIZES)
 @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
-@pytest.mark.parametrize("scale_ub", SCALE_UBS)
+@pytest.mark.parametrize("has_scale_ub", SCALE_UBS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("quant_dtype", QUANT_DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
@@ -119,7 +119,7 @@ def test_rms_norm(
     num_tokens: int,
     hidden_size: int,
     add_residual: bool,
-    scale_ub: bool,
+    has_scale_ub: bool,
     dtype: torch.dtype,
     quant_dtype: torch.dtype,
     seed: int,
@@ -130,7 +130,7 @@ def test_rms_norm(
         torch.cuda.manual_seed(seed)
     torch.set_default_device(device)
 
-    if scale_ub is not None and quant_dtype != torch.float8_e4m3fn:
+    if has_scale_ub and quant_dtype != torch.float8_e4m3fn:
         # skip
         return
 
@@ -143,9 +143,11 @@ def test_rms_norm(
     scale = 1 / (hidden_size)
     x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
     residual = torch.randn_like(x) * scale if add_residual else None
-    if scale_ub is not None:
+    if has_scale_ub:
         rms_x, _ = ref_rms_norm(layer, x, residual)
         scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device="cuda")
+    else:
+        scale_ub = None
 
     ref_out, ref_scales, ref_residual = ref_impl(
         layer, x, quant_dtype, residual, scale_ub
@@ -156,14 +158,27 @@ def test_rms_norm(
 
     assert ref_out.dtype == quant_dtype
     assert ops_out.dtype == quant_dtype
-    assert torch.allclose(ref_scales, ops_scales)
     if quant_dtype == torch.int8:
+        assert torch.allclose(ref_scales, ops_scales, atol=1e-6)
         # big atol to account for round-off errors.
         assert torch.allclose(ref_out, ops_out, atol=1)
     else:
-        assert torch.allclose(
-            ref_out.to(dtype=torch.float32), ops_out.to(dtype=torch.float32)
-        )
+        assert torch.allclose(ref_scales, ops_scales)
+        a = ref_out.to(dtype=torch.float32)
+        b = ops_out.to(dtype=torch.float32)
+        ok = torch.allclose(a, b)
+        if not ok:
+            # fallback: compare dequantized values with relaxed tolerance
+            a_deq = a * ref_scales.view(-1, 1)
+            b_deq = b * ops_scales.view(-1, 1)
+            # NOTE: It is possible that some future test cases trigger this
+            # max diff due to precision issues. If such an error is
+            # encountered, it's recommended to inspect the differences between
+            # all corresponding elements from each tensor (e.g. by looping over
+            # them) and checking how many the max diff error shows up on (just
+            # a few bad elements should still be considered acceptable).
+            ok = torch.allclose(a_deq, b_deq, rtol=5e-2, atol=5e-2)
+        assert ok
     if add_residual:
         assert torch.allclose(ref_residual, ops_residual)
 
diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py
index c59fc7af0c897..98edc959957d0 100644
--- a/tests/kernels/mamba/test_mamba_ssm.py
+++ b/tests/kernels/mamba/test_mamba_ssm.py
@@ -179,6 +179,10 @@ def selective_scan_opcheck_fn(
     has_initial_state=None,
     ssm_states=None,
     pad_slot_id=PAD_SLOT_ID,
+    block_size=2048,
+    block_idx_first_scheduled_token=None,
+    block_idx_last_scheduled_token=None,
+    initial_state_idx=None,
 ):
     """if return_last_state is True, returns (out, last_state)
     last_state has shape (batch, dim, dstate).
@@ -223,6 +227,10 @@ def selective_scan_opcheck_fn(
             has_initial_state,
             ssm_states,
             pad_slot_id,
+            block_size,
+            block_idx_first_scheduled_token,
+            block_idx_last_scheduled_token,
+            initial_state_idx,
         ),
         test_utils=["test_schema", "test_faketensor"],
     )
@@ -338,6 +346,11 @@ def test_selective_scan(
             has_initial_state=torch.ones(batch_size, device=u.device, dtype=torch.bool)
             if c > 0
             else None,
+            pad_slot_id=PAD_SLOT_ID,
+            block_size=2048,
+            block_idx_first_scheduled_token=None,
+            block_idx_last_scheduled_token=None,
+            initial_state_idx=None,
         )
         outs.append(out)
     if len(outs) > 1:
@@ -372,6 +385,7 @@ def test_selective_scan(
         delta_bias=delta_bias,
         delta_softplus=delta_softplus,
         ssm_states=state,
+        block_size=2048,
     )
 
 
@@ -586,6 +600,7 @@ def test_selective_scan_varlen(
         padded_state_indices,
         has_initial_state,
         prev_state,
+        block_size=2048,
     )
 
 
diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py
index f985f9ac7ca67..707068b2bbdc2 100644
--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -6,7 +6,10 @@ import pytest
 import torch
 
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig,
+    fp8_w8a8_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
@@ -22,10 +25,10 @@ from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 
 if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability(
-    100
+    90
 ):
     pytest.skip(
-        "Requires flashinfer_cutlass_fused_moe and nvfp4 support",
+        "Supported for sm >= 90",
         allow_module_level=True,
     )
 
@@ -131,6 +134,8 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
     topk: int,
     monkeypatch,
 ):
+    if not current_platform.has_device_capability(100):
+        pytest.skip("Test is only supported for sm >= 100")
     current_platform.seed_everything(7)
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
     with set_current_vllm_config(vllm_config):
@@ -184,9 +189,6 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
         torch.testing.assert_close(output, flashinfer_output, atol=5.5e-2, rtol=1e-2)
 
 
-@pytest.mark.skip(
-    "Requires flashinfer version that contains https://github.com/flashinfer-ai/flashinfer/pull/1472"
-)
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
@@ -216,9 +218,13 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
 
         quant_config = fp8_w8a8_moe_quant_config(
             w1_scale=td.w13_weight_scale,
+            g1_alphas=(td.w13_weight_scale * td.a1_scale).squeeze(),
             w2_scale=td.w2_weight_scale,
+            g2_alphas=(td.w2_weight_scale * td.a2_scale).squeeze(),
             a1_scale=td.a1_scale,
+            a1_gscale=td.a1_scale,
             a2_scale=td.a2_scale,
+            a2_gscale=1.0 / td.a2_scale,
             per_act_token_quant=False,
         )
 
@@ -238,6 +244,12 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
 
         td.layer.dp_size = 1
 
+        def get_fused_moe_quant_config(n: torch.nn.Module) -> FusedMoEQuantConfig:
+            return quant_config
+
+        td.layer.get_fused_moe_quant_config = get_fused_moe_quant_config
+        td.layer.quant_method = td.layer
+
         flashinfer_cutlass_output = flashinfer_cutlass_moe_fp8(
             td.hidden_states,
             td.layer,
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index 014df1fa111f2..c27cf2468ede5 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -6,6 +6,8 @@ Run `pytest tests/kernels/test_moe.py`.
 """
 
 import functools
+import importlib
+import sys
 from collections.abc import Callable
 from dataclasses import dataclass
 from typing import Any
@@ -20,6 +22,7 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
 import vllm.model_executor.layers.fused_moe  # noqa
 from tests.kernels.moe.utils import fused_moe
 from tests.kernels.utils import opcheck, stack_and_dev, torch_moe
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed.parallel_state import init_distributed_environment
 from vllm.forward_context import set_forward_context
@@ -412,14 +415,12 @@ def test_mixtral_moe(
     huggingface."""
 
     # clear the cache before every test
-    from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-        is_rocm_aiter_moe_enabled,
-    )
+    # Force reload aiter_ops to pick up the new environment variables.
+    if "rocm_aiter_ops" in sys.modules:
+        importlib.reload(rocm_aiter_ops)
 
-    is_rocm_aiter_moe_enabled.cache_clear()
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
-
         if dtype == torch.float32:
             pytest.skip("AITER ROCm test skip for float32")
 
diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
index 97a55c37b9a3e..420dbbffaac08 100644
--- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
@@ -25,6 +25,7 @@ CASES = [
     (8, 16, 128 * 2, fp8_dtype),
     (8, 16, 128 * 3, fp8_dtype),
     (8, 64, 7168, fp8_dtype),
+    (8, 128, 128 * 33, fp8_dtype),
     (8, 128, 7168, fp8_dtype),
     (8, 512, 7168, fp8_dtype),
     (8, 1024, 7168, fp8_dtype),
@@ -54,8 +55,10 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type):
     )
 
     # Run the SiLU V2 kernel
+    # TODO (varun): use_e8m0 is set to false as the reference impl does
+    # not handle that case.
     y_q, y_s = persistent_masked_m_silu_mul_quant(
-        y, tokens_per_expert, group_size=group_size
+        y, tokens_per_expert, group_size=group_size, use_ue8m0=False
     )
 
     torch.cuda.synchronize()
diff --git a/tests/kernels/quantization/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py
index e9b091d06697e..12f1008ecf27f 100644
--- a/tests/kernels/quantization/test_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_nvfp4_quant.py
@@ -168,9 +168,7 @@ def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
     out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)
 
     out, out_scale = ops.scaled_fp4_quant(x, global_scale)
-
     scale_ans = recover_swizzled_scales(out_scale, m, n)
     out_ans = cast_from_fp4(out, m, n)
-
     torch.testing.assert_close(out_ans, out_ref)
     torch.testing.assert_close(scale_ans, scale_ref)
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index eb00bc72b4b0d..5d5a26fbfc2cd 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -4,24 +4,21 @@
 
 import itertools
 import random
-import unittest
 from collections.abc import Sequence
 from numbers import Number
 from typing import Any, NamedTuple
+from unittest.mock import patch
 
 import pytest
 import torch
 from torch._prims_common import TensorLikeType
 
 from tests.kernels.quant_utils import native_w8a8_block_matmul
-from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
-from vllm.attention.backends.registry import _Backend
+from vllm.attention import AttentionType
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.utils import (
     STR_BACKEND_ENV_VAR,
-    STR_FLASH_ATTN_VAL,
-    STR_XFORMERS_ATTN_VAL,
 )
 from vllm.utils.torch_utils import make_tensor_with_pad
 
@@ -512,50 +509,6 @@ def pack_qkv(qkv: QKVInputs, device: torch.device | str) -> PackedQKVInputs:
     )
 
 
-def make_backend(backend_name: str) -> AttentionBackend:
-    """
-    Construct the backend instance determined by the backend_name string
-    argument.
-
-    Note: at time of writing the Attention wrapper automatically selects
-    its own backend for Attention.forward(); so the backend instance which
-    you generate with this function is not meant to be used for *running*
-    inference, but rather for generating compatible metadata structures
-    using backend.make_metadata()
-
-
-    Returns:
-
-    * Backend instance
-    """
-    if backend_name == STR_XFORMERS_ATTN_VAL:
-        from vllm.v1.attention.backends.xformers import XFormersAttentionBackend
-
-        return XFormersAttentionBackend()
-    if backend_name == STR_FLASH_ATTN_VAL:
-        from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
-
-        return FlashAttentionBackend()
-    if backend_name == "TRITON_ATTN":
-        from vllm.v1.attention.backends.triton_attn import TritonAttentionBackend
-
-        return TritonAttentionBackend()
-    if backend_name == "FLEX_ATTENTION":
-        from vllm.v1.attention.backends.flex_attention import FlexAttentionBackend
-
-        return FlexAttentionBackend()
-    if backend_name == "TORCH_SDPA":
-        from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
-
-        return TorchSDPABackend()
-    if backend_name == "FLASHINFER":
-        from vllm.v1.attention.backends.flashinfer import FlashInferBackend
-
-        return FlashInferBackend()
-
-    raise AssertionError(f"Unrecognized backend_name {backend_name} for unit test")
-
-
 def make_alibi_bias(
     alibi_slopes: torch.Tensor,
     num_kv_heads: int,
@@ -877,197 +830,6 @@ def make_block_tables_slot_mapping(
     return (block_tables_tensor, slot_mapping_list, max_block_idx)
 
 
-def make_test_metadata(
-    attn_backend: _Backend,
-    is_prompt: bool,
-    seq_lens: list[int] | None,
-    decoder_test_params: PhaseTestParameters | None,
-    device: torch.device | str,
-    encoder_test_params: PhaseTestParameters | None = None,
-    cross_test_params: PhaseTestParameters | None = None,
-) -> AttentionMetadata:
-    """
-    Construct fake attention metadata for a given test phase
-    (prefill-phase or decode-phase).
-
-    encoder_test_params and cross_test_params arguments allow encoder
-    attention and enc/dec cross-attention (respectively) to use distinct
-    metadata values from decoder self-attention (decoder_test_params.)
-
-    if encoder_test_params and cross_test_params are None, the attention
-    metadata will support decoder-only scenario.
-
-    Assumptions:
-
-    * No chunked prefill -> a batch is 100% prefill or 100% decode, never both
-
-    Arguments:
-
-    * attn_backend_name: Backend for sourcing attention kernels
-    * is_prompt: prefill if True, o/w decode
-    * seq_lens: list of token counts for each sequence
-    * decoder_test_params: decoder self-attention test params;
-                           this function requires
-                           kv_mmap (memory mapping) field
-    * device: CPU or CUDA device
-    * encoder_test_params: encoder attention test params;
-                           this function requires encoder query
-                           sequence lengths field. If None,
-                           encoder query sequence lengths are
-                           treated as None
-    * cross_test_params: enc/dec cross-attention test params;
-                         this function requires kv_mmap field.
-                         If None, KV cache memory map data
-                         structures are treated as None
-
-    Return:
-
-    * AttentionMetadata structure
-    """
-
-    # Decoder self-attention memory mapping
-    # decoder_test_params is None signals encoder-only
-    # scenario, so kv_mmap is None
-    kv_mmap = None if decoder_test_params is None else decoder_test_params.kv_mmap
-
-    # This function constructs metadata assuming no chunked prefill,
-    # i.e. 100% prefill tokens or 100% decode tokens
-    #
-    # - If is_prompt, num_prefills_or_decodes is the number of prefills
-    #   and num_prefill_or_decode_tokens is the number of prefill tokens
-    # - If not is_prompt, num_prefills_or_decodes is the number of decodes
-    #   and num_prefill_or_decode_tokens is the number of decode tokens
-    #
-    # seq_lens is None signals encoder-only
-    # scenario, in which case num_prefills_or_decodes and
-    # num_prefill_or_decode_tokens are unused
-    num_prefills_or_decodes = None if seq_lens is None else len(seq_lens)
-
-    num_prefill_or_decode_tokens = (
-        None if seq_lens is None else (sum(seq_lens) if is_prompt else len(seq_lens))
-    )
-
-    # Seems for non-prefix-caching scenarios context_lens
-    # is never needed
-    context_lens = None
-
-    if encoder_test_params is None:
-        encoder_seq_lens = None
-        num_encoder_tokens = None
-    else:
-        # Encoder/decoder or encoder-only models only:
-        # * Extract encoder input sequence lengths
-        assert encoder_test_params.packed_qkvo.packed_qkv is not None
-        encoder_seq_lens = encoder_test_params.packed_qkvo.packed_qkv.q_seq_lens
-        num_encoder_tokens = (
-            None if encoder_seq_lens is None else (sum(encoder_seq_lens))
-        )
-
-    # For encoder/decoder or encoder-only models only, extract *cross-attention*
-    # slot_mapping and block table (kv_mmap)
-    cross_kv_mmap = None if cross_test_params is None else cross_test_params.kv_mmap
-
-    attn_backend_obj = make_backend(attn_backend.name)
-
-    if is_prompt:
-        # Prefill-phase scenario
-
-        num_prefills = num_prefills_or_decodes
-        num_prefill_tokens = num_prefill_or_decode_tokens
-        num_decode_tokens = 0
-
-        (
-            seq_lens_tensor,
-            context_lens_tensor,
-            _,
-            _,
-            seq_start_loc,
-            encoder_seq_lens_tensor,
-            encoder_seq_start_loc,
-            max_encoder_seq_len,
-        ) = _make_metadata_tensors(
-            seq_lens, context_lens, encoder_seq_lens, device=device
-        )
-        return attn_backend_obj.make_metadata(
-            num_prefills=num_prefills,
-            slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
-            enable_kv_scales_calculation=True,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decode_tokens=num_decode_tokens,
-            seq_lens=seq_lens,
-            seq_lens_tensor=seq_lens_tensor,
-            seq_start_loc=seq_start_loc,
-            max_prefill_seq_len=None if seq_lens is None else max(seq_lens),
-            max_decode_seq_len=0,
-            context_lens_tensor=context_lens_tensor,
-            block_tables=(None if kv_mmap is None else kv_mmap.block_tables),
-            use_cuda_graph=False,
-            num_encoder_tokens=num_encoder_tokens,
-            encoder_seq_lens=encoder_seq_lens,
-            encoder_seq_lens_tensor=encoder_seq_lens_tensor,
-            encoder_seq_start_loc=encoder_seq_start_loc,
-            max_encoder_seq_len=max_encoder_seq_len,
-            cross_slot_mapping=(
-                None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping
-            ),
-            cross_block_tables=(
-                None if cross_kv_mmap is None else cross_kv_mmap.block_tables
-            ),
-        )
-
-    else:  # not is_prompt
-        # Decode-phase scenario
-
-        assert kv_mmap is not None
-        assert num_prefill_or_decode_tokens is not None
-        assert seq_lens is not None
-
-        num_prefills = 0
-        num_prefill_tokens = 0
-        num_decode_tokens = num_prefill_or_decode_tokens
-
-        (
-            seq_lens_tensor,
-            context_lens_tensor,
-            _,
-            _,
-            seq_start_loc,
-            encoder_seq_lens_tensor,
-            encoder_seq_start_loc,
-            max_encoder_seq_len,
-        ) = _make_metadata_tensors(
-            seq_lens, context_lens, encoder_seq_lens, device=device
-        )
-
-        return attn_backend_obj.make_metadata(
-            num_prefills=num_prefills,
-            slot_mapping=kv_mmap.slot_mapping,
-            enable_kv_scales_calculation=True,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decode_tokens=num_decode_tokens,
-            seq_lens=seq_lens,
-            seq_lens_tensor=seq_lens_tensor,
-            seq_start_loc=seq_start_loc,
-            max_prefill_seq_len=0,
-            max_decode_seq_len=max(seq_lens),
-            max_decode_query_len=1,
-            context_lens_tensor=context_lens_tensor,
-            block_tables=kv_mmap.block_tables,
-            use_cuda_graph=False,
-            num_encoder_tokens=num_encoder_tokens,
-            encoder_seq_lens=encoder_seq_lens,
-            encoder_seq_lens_tensor=encoder_seq_lens_tensor,
-            encoder_seq_start_loc=encoder_seq_start_loc,
-            max_encoder_seq_len=max_encoder_seq_len,
-            cross_slot_mapping=(
-                None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping
-            ),
-            cross_block_tables=(
-                None if cross_kv_mmap is None else cross_kv_mmap.block_tables
-            ),
-        )
-
-
 def assert_actual_matches_ideal(
     test_params: PhaseTestParameters, output_under_test: torch.Tensor, backend: str
 ) -> None:
@@ -1308,7 +1070,7 @@ def opcheck(
     raise_exception: bool = True,
     cond: bool = True,
 ) -> dict[str, str]:
-    with unittest.mock.patch("torch.allclose", new=fp8_allclose):
+    with patch("torch.allclose", new=fp8_allclose):
         return (
             torch.library.opcheck(
                 op, args, kwargs, test_utils=test_utils, raise_exception=raise_exception
diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py
index b724e112b9dd3..91ab4a87c65f8 100644
--- a/tests/lora/test_fused_moe_lora_kernel.py
+++ b/tests/lora/test_fused_moe_lora_kernel.py
@@ -134,6 +134,8 @@ def use_fused_moe_lora_kernel(
     )
     expert_ids = torch.empty((max_loras * max_num_m_blocks,), dtype=torch.int32)
     num_tokens_post_padded = torch.empty((max_loras,), dtype=torch.int32)
+    adapter_enabled = torch.ones(max_loras + 1, dtype=torch.int32)
+    lora_ids = torch.arange(max_loras + 2, dtype=torch.int32)
 
     # call kernel
     ops.moe_lora_align_block_size(
@@ -147,6 +149,8 @@ def use_fused_moe_lora_kernel(
         sorted_token_ids,
         expert_ids,
         num_tokens_post_padded,
+        adapter_enabled,
+        lora_ids,
     )
 
     config = {
@@ -154,6 +158,8 @@ def use_fused_moe_lora_kernel(
         "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
+        "NUM_WARPS": 4,
+        "NUM_STAGES": 3,
         "SPLIT_K": 1,
     }
 
@@ -172,10 +178,21 @@ def use_fused_moe_lora_kernel(
         num_tokens_post_padded,
         max_lora_rank,
         top_k_num,
+        lora_ids,
+        adapter_enabled,
         config["BLOCK_SIZE_M"],
         config["BLOCK_SIZE_N"],
         config["BLOCK_SIZE_K"],
         config["GROUP_SIZE_M"],
+        config["NUM_WARPS"],
+        config["NUM_STAGES"],
+        config["SPLIT_K"],
+        config["BLOCK_SIZE_M"],
+        config["BLOCK_SIZE_N"],
+        config["BLOCK_SIZE_K"],
+        config["GROUP_SIZE_M"],
+        config["NUM_WARPS"],
+        config["NUM_STAGES"],
         config["SPLIT_K"],
         mul_routed_weight,
     )
diff --git a/tests/lora/test_gptoss_tp.py b/tests/lora/test_gptoss_tp.py
index db4b7ca5ef499..711d514a39eb3 100644
--- a/tests/lora/test_gptoss_tp.py
+++ b/tests/lora/test_gptoss_tp.py
@@ -32,7 +32,6 @@ The Competition_ID of competition_record is the foreign key of Competition_ID of
 ###Response:<|end|><|start|>assistant<|channel|>final<|message|>"""  # noqa: E501
 
 EXPECTED_LORA_OUTPUT = [
-    "SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;",
     "SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;",
     "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
     "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
@@ -41,9 +40,6 @@ EXPECTED_LORA_OUTPUT = [
 
 def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
     prompts = [
-        PROMPT_TEMPLATE.format(
-            context="What is the average number of working horses of farms with more than 5000 total number of horses?"  # noqa: E501
-        ),  # noqa: E501
         PROMPT_TEMPLATE.format(
             context="Give the average number of working horses on farms with more than 5000 total horses."  # noqa: E501
         ),  # noqa: E501
@@ -67,7 +63,6 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
         generated_text = output.outputs[0].text.strip()
         generated_texts.append(generated_text)
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
     for i in range(len(EXPECTED_LORA_OUTPUT)):
         assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
 
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index 868ca51b33314..12c73f2d79f75 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -56,15 +56,22 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
     )
 
     expected_lora_output = [
-        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
-        "give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])",  # noqa: E501
-        "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])",  # noqa: E501
+        [
+            "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])"  # noqa: E501
+        ],
+        [
+            "give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])",  # noqa: E501
+            "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
+        ],
+        [
+            "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])"  # noqa: E501
+        ],
     ]
-    assert (
-        do_sample(llm, mixtral_lora_files, lora_id=1, prompts=prompts)
-        == expected_lora_output
-    )
-    assert (
-        do_sample(llm, mixtral_lora_files, lora_id=2, prompts=prompts)
-        == expected_lora_output
-    )
+
+    def check_outputs(generated: list[str]):
+        assert len(generated) == len(expected_lora_output)
+        for gen, gt_choices in zip(generated, expected_lora_output):
+            assert gen in gt_choices
+
+    check_outputs(do_sample(llm, mixtral_lora_files, lora_id=1, prompts=prompts))
+    check_outputs(do_sample(llm, mixtral_lora_files, lora_id=2, prompts=prompts))
diff --git a/tests/lora/test_moe_lora_align_sum.py b/tests/lora/test_moe_lora_align_sum.py
index 6cd1281c36328..72f1d759f1e7a 100644
--- a/tests/lora/test_moe_lora_align_sum.py
+++ b/tests/lora/test_moe_lora_align_sum.py
@@ -60,6 +60,8 @@ def test_moe_lora_align_block_size(
         (max_loras * max_num_m_blocks,), num_experts, dtype=torch.int32, device="cuda"
     )
     num_tokens_post_pad = torch.zeros((max_loras,), dtype=torch.int32, device="cuda")
+    adapter_enabled = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda")
+    lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda")
 
     # call kernel
     ops.moe_lora_align_block_size(
@@ -73,6 +75,8 @@ def test_moe_lora_align_block_size(
         sorted_token_ids,
         expert_ids,
         num_tokens_post_pad,
+        adapter_enabled,
+        lora_ids,
     )
 
     # verify values
diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py
index b954e0776ca4a..e659c1e1a9a07 100644
--- a/tests/lora/test_olmoe_tp.py
+++ b/tests/lora/test_olmoe_tp.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+
 import vllm
 from vllm.lora.request import LoRARequest
 
@@ -28,8 +29,17 @@ EXPECTED_LORA_OUTPUT = [
     "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
 ]
 
+EXPECTED_BASE_MODEL_OUTPUT = [
+    "SELECT COUNT(Candidate_ID) FROM candidate",
+    "SELECT COUNT(Candidate_ID) FROM candidate",
+    "SELECT Candidate_ID, COUNT(*) as Total_Candidates\nFROM candidate\nINNER JOIN people ON candidate.People_ID = people.People_ID",  # noqa: E501
+    "SELECT Candidate_ID, Poll_Source FROM candidate WHERE People_ID IN (SELECT People_ID FROM people) ORDER BY COUNT(*) DESC LIMIT 1",  # noqa: E501
+]
 
-def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
+
+def generate_and_test(
+    llm: vllm.LLM, lora_path: str, lora_id: list[int | None] | int | None
+) -> None:
     prompts = [
         PROMPT_TEMPLATE.format(context="How many candidates are there?"),
         PROMPT_TEMPLATE.format(context="Count the number of candidates."),
@@ -40,12 +50,18 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
             context="Return the poll resource associated with the most candidates."
         ),
     ]
+
+    lora_request = None
+    if isinstance(lora_id, int):
+        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
+    elif isinstance(lora_id, list):
+        lora_request = [
+            LoRARequest(str(i), i, lora_path) if i is not None else None
+            for i in lora_id
+        ]
+
     sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
-    )
+    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
     # Print the outputs.
     generated_texts: list[str] = []
     for output in outputs:
@@ -55,7 +71,13 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 
     for i in range(len(EXPECTED_LORA_OUTPUT)):
-        assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
+        req_lora_id = lora_id[i] if isinstance(lora_id, list) else lora_id
+        expected_output = (
+            EXPECTED_LORA_OUTPUT[i]
+            if req_lora_id is not None
+            else EXPECTED_BASE_MODEL_OUTPUT[i]
+        )
+        assert generated_texts[i].startswith(expected_output)
 
 
 def test_olmoe_lora(olmoe_lora_files):
@@ -75,6 +97,20 @@ def test_olmoe_lora(olmoe_lora_files):
     generate_and_test(llm, olmoe_lora_files, lora_id=2)
 
 
+def test_olmoe_lora_mixed(olmoe_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )
+
+    generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None])
+
+
 @multi_gpu_test(num_gpus=2)
 def test_olmoe_lora_tp2(olmoe_lora_files):
     llm = vllm.LLM(
diff --git a/tests/model_executor/model_loader/runai_model_streamer/test_runai_model_streamer_loader.py b/tests/model_executor/model_loader/runai_model_streamer/test_runai_model_streamer_loader.py
index 22bdb3b44eb03..1119d0de1c8b8 100644
--- a/tests/model_executor/model_loader/runai_model_streamer/test_runai_model_streamer_loader.py
+++ b/tests/model_executor/model_loader/runai_model_streamer/test_runai_model_streamer_loader.py
@@ -1,12 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
+
 from vllm import SamplingParams
 from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader import get_model_loader
 
 load_format = "runai_streamer"
 test_model = "openai-community/gpt2"
+# TODO(amacaskill): Replace with a GKE owned GCS bucket.
+test_gcs_model = "gs://vertex-model-garden-public-us/codegemma/codegemma-2b/"
 
 prompts = [
     "Hello, my name is",
@@ -32,3 +36,16 @@ def test_runai_model_loader_download_files(vllm_runner):
     with vllm_runner(test_model, load_format=load_format) as llm:
         deserialized_outputs = llm.generate(prompts, sampling_params)
         assert deserialized_outputs
+
+
+def test_runai_model_loader_download_files_gcs(
+    vllm_runner, monkeypatch: pytest.MonkeyPatch
+):
+    monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "fake-project")
+    monkeypatch.setenv("RUNAI_STREAMER_GCS_USE_ANONYMOUS_CREDENTIALS", "true")
+    monkeypatch.setenv(
+        "CLOUD_STORAGE_EMULATOR_ENDPOINT", "https://storage.googleapis.com"
+    )
+    with vllm_runner(test_gcs_model, load_format=load_format) as llm:
+        deserialized_outputs = llm.generate(prompts, sampling_params)
+        assert deserialized_outputs
diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py
index 41419553aa83f..9121284de85b7 100644
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -4,6 +4,7 @@
 import pytest
 import torch
 
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.activation import (
@@ -15,9 +16,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
     dispatch_topk_func,
     vllm_topk_softmax,
 )
-from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-    is_rocm_aiter_moe_enabled,
-)
 from vllm.model_executor.layers.layernorm import (
     RMSNorm,
     dispatch_rocm_rmsnorm_func,
@@ -126,50 +124,39 @@ def test_enabled_ops_invalid(env: str):
             RMSNorm(1024).enabled()
 
 
-@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
-def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
-    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
-    topk_func = dispatch_topk_func()
-    is_rocm_aiter_moe_enabled.cache_clear()
-    if current_platform.is_rocm() and int(use_rocm_aiter):
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            rocm_aiter_topk_softmax,
-        )
+@pytest.mark.parametrize(
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
+def test_topk_dispatch(use_rocm_aiter: bool):
+    topk_func = dispatch_topk_func(use_rocm_aiter)
 
-        assert topk_func == rocm_aiter_topk_softmax
+    if current_platform.is_rocm() and use_rocm_aiter:
+        assert topk_func == rocm_aiter_ops.topk_softmax
     else:
         assert topk_func == vllm_topk_softmax
 
 
 @pytest.mark.parametrize("add_residual", [True, False])
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
-@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
+@pytest.mark.parametrize("use_rocm_aiter", [True, False])
 @pytest.mark.skipif(
     not current_platform.is_rocm(), reason="AITER is a feature exclusive for ROCm"
 )
 def test_rms_norm_dispatch(
-    add_residual: bool,
-    dtype: torch.dtype,
-    use_rocm_aiter: str,
-    use_rocm_aiter_norm: str,
-    monkeypatch,
+    add_residual: bool, dtype: torch.dtype, use_rocm_aiter: bool
 ):
-    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
-    monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm)
-    rms_norm_func = dispatch_rocm_rmsnorm_func(add_residual, dtype)
+    rms_norm_func = dispatch_rocm_rmsnorm_func(add_residual, dtype, use_rocm_aiter)
 
     should_use_rocm_aiter = (
         current_platform.is_rocm()
-        and int(use_rocm_aiter)
-        and int(use_rocm_aiter_norm)
+        and use_rocm_aiter
         and dtype in RMS_NORM_SUPPORTED_DTYPES
     )
 
     if add_residual and should_use_rocm_aiter:
-        assert rms_norm_func == torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add
+        assert rms_norm_func == rocm_aiter_ops.rms_norm2d_with_add
     elif should_use_rocm_aiter:
-        assert rms_norm_func == torch.ops.vllm.rocm_aiter_rms_norm
+        assert rms_norm_func == rocm_aiter_ops.rms_norm
     elif add_residual:
         assert rms_norm_func == fused_add_rms_norm
     else:
diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index fd2df329f17f9..681b380e6a155 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -19,6 +19,8 @@ pytestmark = pytest.mark.hybrid_model
 # meaning that it will be used in all tests in this file
 # The rest of the models will only be tested by test_models
 
+APC_MULTIPLY_BY = 300
+
 SSM_MODELS = [
     "state-spaces/mamba-130m-hf",
     "tiiuae/falcon-mamba-tiny-dev",
@@ -380,7 +382,7 @@ def _get_vLLM_output(
     return outs, vllm_model
 
 
-@pytest.mark.parametrize("model", [HYBRID_MODELS[3]])
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("n_repetitions", [2])
 # If num_logprobs is set to -1, then the stringent version
@@ -410,10 +412,8 @@ def test_apc_single_prompt(
         check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
     )
 
-    MULTIPLE = 300
-
     # Sample prompts.
-    generated_prompts = [MULTIPLE * example_prompts[0]]
+    generated_prompts = [APC_MULTIPLY_BY * example_prompts[0]]
 
     max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
     vllm_runner_kwargs = _get_vllm_runner_params(
@@ -446,7 +446,7 @@ def test_apc_single_prompt(
         )
 
 
-@pytest.mark.parametrize("model", [HYBRID_MODELS[3]])
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("n_repetitions", [2])
 # If num_logprobs is set to -1, then the stringent version
@@ -476,10 +476,8 @@ def test_apc_single_prompt_block_align_alignment(
         check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
     )
 
-    MULTIPLE = 300
-
     # Sample prompts. This custom prompt is used, as it causes the most issues
-    generated_prompts = ["The president of the United States is " * MULTIPLE]
+    generated_prompts = ["The president of the United States is " * APC_MULTIPLY_BY]
 
     max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
     vllm_runner_kwargs = _get_vllm_runner_params(
@@ -528,7 +526,7 @@ def test_apc_single_prompt_block_align_alignment(
             )
 
 
-@pytest.mark.parametrize("model", [HYBRID_MODELS[3]])
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("n_repetitions", [2])
 # If num_logprobs is set to -1, then the stringent version
@@ -558,10 +556,8 @@ def test_apc_multiple_prompts_all_cached_outputs(
         check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
     )
 
-    MULTIPLE = 300
-
     # Sample prompts.
-    generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
+    generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
 
     max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
     vllm_runner_kwargs = _get_vllm_runner_params(
@@ -595,7 +591,7 @@ def test_apc_multiple_prompts_all_cached_outputs(
         )
 
 
-@pytest.mark.parametrize("model", [HYBRID_MODELS[3]])
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("n_repetitions", [2])
 # If num_logprobs is set to -1, then the stringent version
@@ -625,12 +621,12 @@ def test_apc_multiple_prompts_block_align_alignment(
         check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
     )
 
-    MULTIPLE = 300
-
     # Sample prompts. This custom prompt is used, as it causes the most issues
     prompt_text = "The president of the United States is "
     prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
-    generated_prompts = [prompt_text[offset:] * MULTIPLE for offset in prompt_offsets]
+    generated_prompts = [
+        prompt_text[offset:] * APC_MULTIPLY_BY for offset in prompt_offsets
+    ]
 
     max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
     vllm_runner_kwargs = _get_vllm_runner_params(
@@ -679,7 +675,7 @@ def test_apc_multiple_prompts_block_align_alignment(
             )
 
 
-@pytest.mark.parametrize("model", [HYBRID_MODELS[3]])
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("n_repetitions", [2])
 # If num_logprobs is set to -1, then the stringent version
@@ -709,10 +705,8 @@ def test_apc_multiple_prompts_partial_cached_outputs(
         check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
     )
 
-    MULTIPLE = 300
-
     # Sample prompts.
-    generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
+    generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
 
     max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
     vllm_runner_kwargs = _get_vllm_runner_params(
diff --git a/tests/models/multimodal/pooling/test_radio.py b/tests/models/multimodal/pooling/test_radio.py
index 8929563d8b050..1f5baed83fa62 100644
--- a/tests/models/multimodal/pooling/test_radio.py
+++ b/tests/models/multimodal/pooling/test_radio.py
@@ -42,6 +42,10 @@ def run_radio_test(
 
     config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
 
+    # RADIO model on HF does not properly handle torch_dtype argument
+    # And relies on args["dtype"] which we have to patch manually:
+    config.args["dtype"] = torch_dtype
+
     hf_model = AutoModel.from_pretrained(
         model_id,
         config=config,
@@ -50,6 +54,13 @@ def run_radio_test(
     ).to("cuda")
     hf_model.eval()
 
+    # A HF model has image normalization as a part of model's forward
+    # However in vLLM we don't make normalization a part of the model
+    # forward step since mean/std stored as model's parameters and
+    # subject to precision loss (when using fp16/bf16) which negatively
+    # affects evaluation benchmarks.
+    hf_model.make_preprocessor_external()
+
     hf_outputs_per_image = [
         hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
     ]
@@ -78,7 +89,7 @@ def run_radio_test(
         "nvidia/C-RADIOv2-H",
     ],
 )
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
 def test_radio(dist_init, image_assets, model_id, dtype: str) -> None:
     run_radio_test(
         image_assets,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 8e1dd4ba91f1d..8f19a048677ec 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -219,7 +219,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
         "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
         trust_remote_code=True,
     ),
-    "DeepseekForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-llm-7b-chat"),
+    "DeepseekForCausalLM": _HfExamplesInfo(
+        "deepseek-ai/deepseek-moe-16b-base",
+        trust_remote_code=True,
+    ),
     "DeepseekV2ForCausalLM": _HfExamplesInfo(
         "deepseek-ai/DeepSeek-V2-Lite-Chat",
         trust_remote_code=True,
@@ -363,6 +366,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"),
     "Olmo3ForCausalLM": _HfExamplesInfo("shanearora/2025-sep-a-base-model"),
     "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"),
+    "OpenPanguMTPModel": _HfExamplesInfo(
+        "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1",
+        trust_remote_code=True,
+        is_available_online=False,
+    ),
     "OPTForCausalLM": _HfExamplesInfo(
         "facebook/opt-125m", {"1b": "facebook/opt-iml-max-1.3b"}
     ),
@@ -370,6 +378,14 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
         "OrionStarAI/Orion-14B-Chat", trust_remote_code=True
     ),
     "OuroForCausalLM": _HfExamplesInfo("ByteDance/Ouro-1.4B", trust_remote_code=True),
+    "PanguEmbeddedForCausalLM": _HfExamplesInfo(
+        "FreedomIntelligence/openPangu-Embedded-7B-V1.1", trust_remote_code=True
+    ),
+    "PanguUltraMoEForCausalLM": _HfExamplesInfo(
+        "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1",
+        trust_remote_code=True,
+        is_available_online=False,
+    ),
     "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
     "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
     "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
@@ -378,8 +394,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     ),
     "Plamo2ForCausalLM": _HfExamplesInfo(
         "pfnet/plamo-2-1b",
-        max_transformers_version="4.55.4",
-        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
         trust_remote_code=True,
     ),
     "QWenLMHeadModel": _HfExamplesInfo(
@@ -712,6 +726,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         },
     ),
     "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
+    "PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
+        "PaddlePaddle/PaddleOCR-VL",
+        trust_remote_code=True,
+    ),
     "PaliGemmaForConditionalGeneration": _HfExamplesInfo(
         "google/paligemma-3b-mix-224",
         extras={"v2": "google/paligemma2-3b-ft-docci-448"},
@@ -882,27 +900,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
 
 _TRANSFORMERS_BACKEND_MODELS = {
     "TransformersEmbeddingModel": _HfExamplesInfo(
-        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0"
+        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0.dev"
     ),
     "TransformersForSequenceClassification": _HfExamplesInfo(
         "papluca/xlm-roberta-base-language-detection",
-        min_transformers_version="5.0.0",
+        min_transformers_version="5.0.0.dev",
     ),
     "TransformersForCausalLM": _HfExamplesInfo(
         "hmellor/Ilama-3.2-1B", trust_remote_code=True
     ),
     "TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
     "TransformersMoEForCausalLM": _HfExamplesInfo(
-        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0"
+        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
-        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMoEEmbeddingModel": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMoEForSequenceClassification": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"),
     "TransformersMultiModalForSequenceClassification": _HfExamplesInfo(
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 06e51df32d184..a18f5b6077636 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -82,7 +82,7 @@ def test_models(
     from packaging.version import Version
 
     installed = Version(transformers.__version__)
-    required = Version("5.0.0")
+    required = Version("5.0.0.dev")
     if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
         pytest.skip(
             "MoE models with the Transformers backend require "
diff --git a/tests/quantization/test_blackwell_moe.py b/tests/quantization/test_blackwell_moe.py
index 3cae6f46147bf..8dd4551ff4b96 100644
--- a/tests/quantization/test_blackwell_moe.py
+++ b/tests/quantization/test_blackwell_moe.py
@@ -172,21 +172,9 @@ def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch
     can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
 
 
-def test_gptoss_dp2_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
-    monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput")
+def test_gptoss_eager(monkeypatch: pytest.MonkeyPatch):
     can_initialize(
         "openai/gpt-oss-20b",
-        extra_args=["--data-parallel-size", "2", "--enable-expert-parallel"],
-        hf_overrides=HF_OVERRIDE_TEXT,
-    )
-
-
-def test_gptoss_dp2_mxfp4bf16_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
-    monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput")
-    can_initialize(
-        "openai/gpt-oss-20b",
-        extra_args=["--data-parallel-size", "2", "--enable-expert-parallel"],
         hf_overrides=HF_OVERRIDE_TEXT,
+        extra_args=["--enforce-eager"],
     )
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index bb3572752d9e2..f02da2996ffea 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -18,7 +18,12 @@ from vllm.platforms import current_platform
 
 MODELS = [
     "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+    # The checkpoint below was removed from the HF.
+    # TODO: add a small replacement checkpoint.
+    pytest.param(
+        "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+        marks=pytest.mark.skip(reason="Checkpoint removed from HF."),
+    ),
 ]
 
 
diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
index cab198a2a15e2..82413f36e997f 100644
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -99,7 +99,7 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
 
 
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
-def test_on_the_fly_quant_config_dict_json(vllm_runner):
+def test_online_quant_config_dict_json(vllm_runner):
     """Testing on the fly quantization, load_weights integration point,
     with config dict serialized to json string
     """
@@ -133,7 +133,7 @@ def test_on_the_fly_quant_config_dict_json(vllm_runner):
 
 
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
-def test_on_the_fly_quant_config_file(vllm_runner):
+def test_online_quant_config_file(vllm_runner):
     """Testing on the fly quantization, load_weights integration point,
     with config file
     """
@@ -252,6 +252,148 @@ def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
     ) as llm:
         output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
 
+    assert output
+
+
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+@pytest.mark.skip(
+    reason="since torchao nightly is only compatible with torch nightly"
+    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
+    "torchao tests that requires newer versions (0.14.0.dev+) for now"
+)
+def test_opt_125m_int4wo_model_running_preshuffled_kernel(vllm_runner, monkeypatch):
+    """We load a model with Int4Tensor (plain format) linear weights
+    and verify that the weight is updated to Int4PreshuffledTensor
+    after loading in vllm
+    """
+    from torchao.quantization import Int4PreshuffledTensor
+    from torchao.utils import _is_fbgemm_gpu_genai_available, is_sm_at_least_90
+
+    torch._dynamo.reset()
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+    model_name = "torchao-testing/opt-125m-Int4WeightOnlyConfig-v2-0.14.0.dev"
+    # Note: using enforce_eager=True because the `bf16i4bf16_shuffled` doesn't
+    # have meta kernel implemented yet, can remove this flag after that is implemented
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+        enforce_eager=True,
+    ) as llm:
+
+        def has_int4_preshuffled_tensor_weight(model):
+            return isinstance(
+                model.model.decoder.layers[0].self_attn.qkv_proj.weight,
+                Int4PreshuffledTensor,
+            )
+
+        def get_weight_attrs(model):
+            weight = model.model.decoder.layers[0].self_attn.qkv_proj.weight
+            return [
+                weight.requires_grad,
+                weight.input_dim,
+                weight.output_dim,
+                hasattr(weight, "weight_loader"),
+            ]
+
+        llm_engine = llm.get_llm().llm_engine
+        has_int4_preshuffled_tensor = any(
+            llm_engine.apply_model(has_int4_preshuffled_tensor_weight)
+        )
+        weight_attrs = llm_engine.apply_model(get_weight_attrs)[0]
+
+        # making sure we are using Int4PreshuffledTensor on H100 GPU, when
+        # fbgemm_gpu_genai
+        # library is installed, otherwise it should be using Int4Tensor
+        if _is_fbgemm_gpu_genai_available() and is_sm_at_least_90():
+            assert has_int4_preshuffled_tensor
+        else:
+            assert not has_int4_preshuffled_tensor
+
+        assert weight_attrs == [False, 1, 0, True]
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+
+        assert output
+
+
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+@pytest.mark.skip(
+    reason="since torchao nightly is only compatible with torch nightly"
+    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
+    "torchao tests that requires newer versions (0.14.0.dev+) for now"
+)
+def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
+    vllm_runner, monkeypatch
+):
+    """We load a bf16 model and online quantize the model to int4, then verify that
+    the weights are updated to Int4PreshuffledTensor after online quantization
+    """
+    from torchao.quantization import Int4PreshuffledTensor
+    from torchao.utils import _is_fbgemm_gpu_genai_available, is_sm_at_least_90
+
+    torch._dynamo.reset()
+    model_name = "facebook/opt-125m"
+
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+
+    import json
+
+    from torchao.core.config import config_to_dict
+    from torchao.quantization import Int4WeightOnlyConfig
+
+    torchao_quant_config = Int4WeightOnlyConfig(
+        group_size=128, int4_packing_format="plain"
+    )
+    hf_overrides = {
+        "quantization_config_dict_json": json.dumps(
+            config_to_dict(torchao_quant_config)
+        )
+    }
+
+    # Note: using enforce_eager=True because the `bf16i4bf16_shuffled` doesn't
+    # have meta kernel implemented yet, can remove this flag after that is implemented
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+        hf_overrides=hf_overrides,
+        enforce_eager=True,
+    ) as llm:
+
+        def has_int4_preshuffled_tensor_weight(model):
+            return isinstance(
+                model.model.decoder.layers[0].self_attn.qkv_proj.weight,
+                Int4PreshuffledTensor,
+            )
+
+        def get_weight_attrs(model):
+            weight = model.model.decoder.layers[0].self_attn.qkv_proj.weight
+            return [
+                weight.requires_grad,
+                weight.input_dim,
+                weight.output_dim,
+                hasattr(weight, "weight_loader"),
+            ]
+
+        llm_engine = llm.get_llm().llm_engine
+        has_int4_preshuffled_tensor = any(
+            llm_engine.apply_model(has_int4_preshuffled_tensor_weight)
+        )
+        weight_attrs = llm_engine.apply_model(get_weight_attrs)[0]
+
+        # making sure we are using Int4PreshuffledTensor on H100 GPU, when
+        # fbgemm_gpu_genai
+        # library is installed, otherwise it should be using Int4Tensor
+        if _is_fbgemm_gpu_genai_available() and is_sm_at_least_90():
+            assert has_int4_preshuffled_tensor
+        else:
+            assert not has_int4_preshuffled_tensor
+
+        assert weight_attrs == [False, 1, 0, True]
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+
         assert output
 
 
diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py
index ddda50fe770a6..d31b1c7d169b7 100644
--- a/tests/reasoning/test_base_thinking_reasoning_parser.py
+++ b/tests/reasoning/test_base_thinking_reasoning_parser.py
@@ -151,57 +151,57 @@ class TestBaseThinkingReasoningParserMethods:
 class TestBaseThinkingReasoningParserExtraction:
     """Test reasoning content extraction methods."""
 
-    def test_extract_reasoning_content_with_both_tokens(self, test_tokenizer):
+    def test_extract_reasoning_with_both_tokens(self, test_tokenizer):
         """Test extraction when both start and end tokens are present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = "<test:think>This is reasoning</test:think>This is content"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == "This is reasoning"
         assert content == "This is content"
 
-    def test_extract_reasoning_content_only_end_token(self, test_tokenizer):
+    def test_extract_reasoning_only_end_token(self, test_tokenizer):
         """Test extraction when only end token is present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = "This is reasoning</test:think>This is content"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == "This is reasoning"
         assert content == "This is content"
 
-    def test_extract_reasoning_content_no_end_token(self, test_tokenizer):
+    def test_extract_reasoning_no_end_token(self, test_tokenizer):
         """Test extraction when no end token is present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = "This is just content"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == "This is just content"
         assert content is None
 
-    def test_extract_reasoning_content_empty_output(self, test_tokenizer):
+    def test_extract_reasoning_empty_output(self, test_tokenizer):
         """Test extraction with empty output."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = ""
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == ""
         assert content is None
 
-    def test_extract_reasoning_content_only_tokens(self, test_tokenizer):
+    def test_extract_reasoning_only_tokens(self, test_tokenizer):
         """Test extraction with only tokens and no content."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = "<test:think></test:think>"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == ""
         assert content is None
diff --git a/tests/reasoning/test_deepseekr1_reasoning_parser.py b/tests/reasoning/test_deepseekr1_reasoning_parser.py
index 946d01c123c5d..91f0c93653d32 100644
--- a/tests/reasoning/test_deepseekr1_reasoning_parser.py
+++ b/tests/reasoning/test_deepseekr1_reasoning_parser.py
@@ -21,97 +21,97 @@ def deepseek_r1_qwen_tokenizer():
 
 SIMPLE_REASONING = {
     "output": "This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING = {
     "output": "This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES = {
     "output": "This\nThat</think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING = {
     "output": "</think>This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING = {
     "output": "</think>This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 REASONING_WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": "<think>This\nThat</think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
     "output": "</think>This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_WITH_THINK = {
     "output": "</think>This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 THINK_NO_END = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY = {
     "output": "",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY_STREAMING = {
     "output": "",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
     "is_reasoning_end": False,
 }
 NEW_LINE = {
     "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -121,7 +121,7 @@ NEW_LINE = {
 # or not.
 NEW_LINE_STREAMING = {
     "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
-    "reasoning_content": "\nThis is a reasoning section",
+    "reasoning": "\nThis is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -269,7 +269,7 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
     # Test is_reasoning_end
diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py
index 3d12f3e5b30e8..6e8f0e8dcc9b9 100644
--- a/tests/reasoning/test_deepseekv3_reasoning_parser.py
+++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py
@@ -5,11 +5,9 @@ import pytest
 from transformers import AutoTokenizer
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
-from vllm.reasoning import (
-    DeepSeekR1ReasoningParser,
-    DeepSeekV3ReasoningParser,
-    IdentityReasoningParser,
-)
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from vllm.reasoning.deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
+from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
 
 REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-V3.1"
 
@@ -46,14 +44,14 @@ def test_identity_reasoning_parser_basic(tokenizer):
     # Test extract_content_ids returns all input_ids
     assert parser.extract_content_ids(input_ids) == input_ids
 
-    # Test extract_reasoning_content returns (None, model_output)
+    # Test extract_reasoning returns (None, model_output)
     request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
-    reasoning, content = parser.extract_reasoning_content(input_text, request)
+    reasoning, content = parser.extract_reasoning(input_text, request)
     assert reasoning is None
     assert content == input_text
 
-    # Test extract_reasoning_content_streaming returns DeltaMessage or None
-    result = parser.extract_reasoning_content_streaming(
+    # Test extract_reasoning_streaming returns DeltaMessage or None
+    result = parser.extract_reasoning_streaming(
         previous_text="",
         current_text="Hello world",
         delta_text="Hello world",
@@ -65,7 +63,7 @@ def test_identity_reasoning_parser_basic(tokenizer):
     assert result.content == "Hello world"
 
     # If delta_text is empty, should return None
-    result_none = parser.extract_reasoning_content_streaming(
+    result_none = parser.extract_reasoning_streaming(
         previous_text="Hello world",
         current_text="Hello world",
         delta_text="",
diff --git a/tests/reasoning/test_ernie45_reasoning_parser.py b/tests/reasoning/test_ernie45_reasoning_parser.py
index 344478013e6b4..dbf5507ae68ba 100644
--- a/tests/reasoning/test_ernie45_reasoning_parser.py
+++ b/tests/reasoning/test_ernie45_reasoning_parser.py
@@ -20,36 +20,36 @@ def ernie45_tokenizer():
 # 带 </think>，非stream
 WITH_THINK = {
     "output": "abc</think>def",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": "def",
 }
 # 带 </think>，stream
 WITH_THINK_STREAM = {
     "output": "abc</think>def",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": "def",
 }
-# without </think>, all is reasoning_content
+# without </think>, all is reasoning
 WITHOUT_THINK = {
     "output": "abc",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": None,
 }
-# without </think>, all is reasoning_content
+# without </think>, all is reasoning
 WITHOUT_THINK_STREAM = {
     "output": "abc",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": None,
 }
 
 COMPLETE_REASONING = {
     "output": "abc</think>",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": None,
 }
 MULTILINE_REASONING = {
     "output": "abc\nABC</think>def\nDEF",
-    "reasoning_content": "abc\nABC",
+    "reasoning": "abc\nABC",
     "content": "def\nDEF",
 }
 
@@ -120,5 +120,5 @@ def test_reasoning(
 
     print()
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
diff --git a/tests/reasoning/test_glm4_moe_reasoning_parser.py b/tests/reasoning/test_glm4_moe_reasoning_parser.py
index 0a8595a00fcb5..6f7827e5b8277 100644
--- a/tests/reasoning/test_glm4_moe_reasoning_parser.py
+++ b/tests/reasoning/test_glm4_moe_reasoning_parser.py
@@ -21,54 +21,54 @@ def glm45_tokenizer():
 
 WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 
 WITH_THINK_STREAM = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 
 WITHOUT_THINK = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": False,
 }
 
 WITHOUT_THINK_STREAM = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": False,
 }
 
 COMPLETE_REASONING = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTILINE_REASONING = {
     "output": "<think>This is a reasoning\nsection</think>This is the rest\nThat",
-    "reasoning_content": "This is a reasoning\nsection",
+    "reasoning": "This is a reasoning\nsection",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 ONLY_OPEN_TAG = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "<think>This is a reasoning section",
     "is_reasoning_end": False,
 }
 
 ONLY_OPEN_TAG_STREAM = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
@@ -184,7 +184,7 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
     output_ids = glm45_tokenizer.convert_tokens_to_ids(output)
diff --git a/tests/reasoning/test_gptoss_reasoning_parser.py b/tests/reasoning/test_gptoss_reasoning_parser.py
new file mode 100644
index 0000000000000..873135d5717f8
--- /dev/null
+++ b/tests/reasoning/test_gptoss_reasoning_parser.py
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from vllm.reasoning import ReasoningParser
+from vllm.reasoning.gptoss_reasoning_parser import GptOssReasoningParser
+
+REASONING_MODEL_NAME = "openai/gpt-oss-120b"
+
+
+@pytest.fixture(scope="module")
+def gpt_oss_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+USER_MESSAGE_START = "<|start|>user<|message|>"
+REASONING_SECTION_START = "<|end|><|start|>assistant<|channel|>analysis<|message|>"
+ASSISTANT_CONTENT_START_PREFIX = "<|end|><|start|>assistant<|channel|>final"
+ASSISTANT_CONTENT_START_SUFFIX = "<|message|>"
+ASSISTANT_CONTENT_START = (
+    ASSISTANT_CONTENT_START_PREFIX + ASSISTANT_CONTENT_START_SUFFIX
+)
+
+BASIC_CONTENT = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START
+    + "This is the rest",
+    "is_reasoning_end": True,
+}
+
+BASIC_REASONING_ONLY = {
+    "output": REASONING_SECTION_START + "This is reasoning" + "<|end|>",
+    "is_reasoning_end": False,
+}
+BASIC_NO_REASONING_NO_ASSISTANT = {
+    "output": USER_MESSAGE_START + "This is a user message",
+    "is_reasoning_end": False,
+}
+
+# Edge-case where the model omits the assistant tag entirely.
+BASIC_NO_REASONING_ASSISTANT = {
+    "output": USER_MESSAGE_START + "This is a user message<|end|><|channel|>final",
+    "is_reasoning_end": True,
+}
+
+COMPLEX_CONTENT_INCOMPLETE_PREFIX_ONLY = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_PREFIX,
+    "is_reasoning_end": False,
+}
+
+COMPLEX_CONTENT_SUFFIX_ONLY = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_SUFFIX,
+    "is_reasoning_end": False,
+}
+
+COMPLEX_CONTENT_1_NO_SUFFIX = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_PREFIX
+    + "<|constrain|> JSON ",
+    "is_reasoning_end": False,
+}
+
+COMPLEX_CONTENT_1 = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_PREFIX
+    + "<|constrain|> JSON "
+    + ASSISTANT_CONTENT_START_SUFFIX,
+    "is_reasoning_end": True,
+}
+
+COMPLEX_CONTENT_1_WITH_CONTENT = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_PREFIX
+    + "<|constrain|> JSON "
+    + ASSISTANT_CONTENT_START_SUFFIX
+    + "This is the rest",
+    "is_reasoning_end": True,
+}
+
+COMPLEX_CONTENT_2 = {
+    "output": REASONING_SECTION_START
+    + "This is reasoning"
+    + ASSISTANT_CONTENT_START_PREFIX
+    + "<|constrain|>ReplyAction "
+    + ASSISTANT_CONTENT_START_SUFFIX
+    + "This is the rest",
+    "is_reasoning_end": True,
+}
+
+TEST_CASES = [
+    BASIC_CONTENT,
+    BASIC_REASONING_ONLY,
+    COMPLEX_CONTENT_INCOMPLETE_PREFIX_ONLY,
+    COMPLEX_CONTENT_SUFFIX_ONLY,
+    COMPLEX_CONTENT_1_NO_SUFFIX,
+    COMPLEX_CONTENT_1,
+    COMPLEX_CONTENT_1_WITH_CONTENT,
+    COMPLEX_CONTENT_2,
+]
+
+
+@pytest.mark.parametrize(
+    "output, is_reasoning_end",
+    [(t["output"], t["is_reasoning_end"]) for t in TEST_CASES],
+)
+def test_gptoss_is_reasoning_end(
+    output,
+    is_reasoning_end,
+    gpt_oss_tokenizer,
+):
+    output = gpt_oss_tokenizer.tokenize(output)
+    parser: ReasoningParser = GptOssReasoningParser(gpt_oss_tokenizer)
+
+    # Test is_reasoning_end
+    output_ids = gpt_oss_tokenizer.convert_tokens_to_ids(output)
+    actual_is_reasoning_end = parser.is_reasoning_end(output_ids)
+    assert is_reasoning_end == actual_is_reasoning_end
diff --git a/tests/reasoning/test_granite_reasoning_parser.py b/tests/reasoning/test_granite_reasoning_parser.py
index de1663408d72d..14aad3ad08189 100644
--- a/tests/reasoning/test_granite_reasoning_parser.py
+++ b/tests/reasoning/test_granite_reasoning_parser.py
@@ -12,37 +12,37 @@ START_RESPONSE = "Here is my response:"
 
 SIMPLE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 NO_REASONING = {
     "output": "This is content",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is content",
 }
 MULTIPLE_LINES = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 
@@ -141,7 +141,7 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
 
@@ -155,7 +155,7 @@ STREAMING_1 = {
     "previous_text": None,
     "current_text": "Here",
     "delta_text": "Here",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # When we fail, we should give what was previously being silenced first
@@ -163,7 +163,7 @@ STREAMING_2 = {
     "previous_text": "Here is my thought",
     "current_text": "Here is my thought failure",
     "delta_text": " failure",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "Here is my thought failure",
 }
 # But then after the first one, we should only add the delta text to content
@@ -171,7 +171,7 @@ STREAMING_3 = {
     "previous_text": "Here wrong",
     "current_text": " words",
     "delta_text": " Here wrong words",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": " words",
 }
 # But then after the first one, we should only add the delta text to content
@@ -179,7 +179,7 @@ STREAMING_4 = {
     "previous_text": "Here is my thought",
     "current_text": "Here is my thought process:",
     "delta_text": " process:",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # Reasoning started successfully; parse reasoning content
@@ -187,7 +187,7 @@ STREAMING_5 = {
     "previous_text": "Here is my thought process:",
     "current_text": "Here is my thought process: foo",
     "delta_text": " foo",
-    "reasoning_content": " foo",
+    "reasoning": " foo",
     "content": None,
 }
 # Response special sequence has started, but not finished.
@@ -195,7 +195,7 @@ STREAMING_6 = {
     "previous_text": "Here is my thought process: foo",
     "current_text": "Here is my thought process: foo Here is",
     "delta_text": " Here is",
-    "reasoning_content": " ",
+    "reasoning": " ",
     "content": None,
 }
 # Response special sequence started, but was broken; the reasoning
@@ -204,7 +204,7 @@ STREAMING_7 = {
     "previous_text": "Here is my thought process: foo Here is",
     "current_text": "Here is my thought process: foo Here is Here",
     "delta_text": " Here",
-    "reasoning_content": "Here is ",
+    "reasoning": "Here is ",
     "content": None,
 }
 # Response special sequence is ongoing
@@ -212,7 +212,7 @@ STREAMING_8 = {
     "previous_text": "Here is my thought process: foo Here is my response:",
     "current_text": "Here is my thought process: foo Here is my response: bar",
     "delta_text": " bar",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": " bar",
 }
 # The delta text has everything; we should be able to correctly parse both
@@ -220,7 +220,7 @@ STREAMING_9 = {
     "previous_text": None,
     "current_text": "Here is my thought process: foo Here is my response: bar",
     "delta_text": "Here is my thought process: foo Here is my response: bar",
-    "reasoning_content": " foo ",
+    "reasoning": " foo ",
     "content": " bar",
 }
 ## The Response is ongoing, and the delta mixes reasoning content / content
@@ -228,7 +228,7 @@ STREAMING_10 = {
     "previous_text": "Here is my thought process: foo",
     "current_text": "Here is my thought process: foo bar Here is my response: baz",
     "delta_text": " bar Here is my response: baz",
-    "reasoning_content": " bar ",
+    "reasoning": " bar ",
     "content": " baz",
 }
 # The delta text starts a new substring that might be a response special seq
@@ -236,7 +236,7 @@ STREAMING_11 = {
     "previous_text": "Here is my thought process: This is a reasoning section ",
     "current_text": "Here is my thought process: This is a reasoning section Here",
     "delta_text": "Here",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # The delta text is finishing the response special seq
@@ -244,14 +244,14 @@ STREAMING_12 = {
     "previous_text": "Here is my thought process: foo Here is my response",
     "current_text": "Here is my thought process: foo Here is my response:",
     "delta_text": ":",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 STREAMING_13 = {
     "previous_text": "Here is my thought process: foo Here",
     "current_text": "Here is my thought process: foo Here was",
     "delta_text": " was",
-    "reasoning_content": "Here was",
+    "reasoning": "Here was",
     "content": None,
 }
 
@@ -326,7 +326,7 @@ def test_streaming_subcases(param_dict):
         tokenizer
     )
 
-    response = parser.extract_reasoning_content_streaming(
+    response = parser.extract_reasoning_streaming(
         previous_text=param_dict["previous_text"],
         current_text=param_dict["current_text"],
         delta_text=param_dict["delta_text"],
@@ -336,9 +336,9 @@ def test_streaming_subcases(param_dict):
     )
     # Streaming currently expects at least one of reasoning content / content,
     # so the response should return None in that case.
-    if param_dict["reasoning_content"] is None and param_dict["content"] is None:
+    if param_dict["reasoning"] is None and param_dict["content"] is None:
         assert response is None
     else:
         assert isinstance(response, DeltaMessage)
-        assert param_dict["reasoning_content"] == response.reasoning_content
+        assert param_dict["reasoning"] == response.reasoning
         assert param_dict["content"] == response.content
diff --git a/tests/reasoning/test_hunyuan_reasoning_parser.py b/tests/reasoning/test_hunyuan_reasoning_parser.py
index b7e3ea73ccdef..32e753d2abb72 100644
--- a/tests/reasoning/test_hunyuan_reasoning_parser.py
+++ b/tests/reasoning/test_hunyuan_reasoning_parser.py
@@ -14,49 +14,49 @@ END_RESPONSE = "\n</answer>"
 
 NO_REASONING_QUICK_THROUGHT = {
     "output": f"{START_REASONING}{START_RESPONSE}This is the rest{END_RESPONSE}",  # noqa: E501
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }
 
 SIMPLE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest{END_RESPONSE}",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 
 COMPLETE_REASONING_WITH_SYMBOL = {
     "output": f"{START_REASONING}This is a reasoning section!{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section!",
+    "reasoning": "This is a reasoning section!",
     "content": None,
 }
 NO_REASONING = {
     "output": "This is content",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is content",
 }
 MULTIPLE_LINES = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 
@@ -164,5 +164,5 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
diff --git a/tests/reasoning/test_mistral_reasoning_parser.py b/tests/reasoning/test_mistral_reasoning_parser.py
index ff7f94b40ee11..5163c863863a7 100644
--- a/tests/reasoning/test_mistral_reasoning_parser.py
+++ b/tests/reasoning/test_mistral_reasoning_parser.py
@@ -20,97 +20,97 @@ def mistral_tokenizer():
 
 SIMPLE_REASONING = {
     "output": "This is a reasoning section[/THINK]This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING = {
     "output": "This is a reasoning section[/THINK]",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES = {
     "output": "This\nThat[/THINK]This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 REASONING_WITH_THINK = {
     "output": "[THINK]This is a reasoning section[/THINK]This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": "[THINK]This is a reasoning section[/THINK]",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": "[THINK]This\nThat[/THINK]This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_WITH_THINK = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 THINK_NO_END = {
     "output": "[THINK]This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY = {
     "output": "",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY_STREAMING = {
     "output": "",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
     "is_reasoning_end": False,
 }
 NEW_LINE = {
     "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -120,7 +120,7 @@ NEW_LINE = {
 # or not.
 NEW_LINE_STREAMING = {
     "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
-    "reasoning_content": "\nThis is a reasoning section",
+    "reasoning": "\nThis is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -307,7 +307,7 @@ def test_mistral_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
     # Test is_reasoning_end
diff --git a/tests/reasoning/test_olmo3_reasoning_parser.py b/tests/reasoning/test_olmo3_reasoning_parser.py
index 4a2eca994610e..bc0e72e2a4563 100644
--- a/tests/reasoning/test_olmo3_reasoning_parser.py
+++ b/tests/reasoning/test_olmo3_reasoning_parser.py
@@ -13,43 +13,43 @@ END_REASONING = "</think>"
 
 NO_REASONING = {
     "output": f"{START_REASONING}{END_REASONING}No thoughts, head empty!",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "No thoughts, head empty!",
 }
 
 NO_REASONING_WITH_NEWLINE = {
     "output": f"{START_REASONING}\n{END_REASONING}\n\nNo thoughts, head empty!",
-    "reasoning_content": "\n",
+    "reasoning": "\n",
     "content": "\n\nNo thoughts, head empty!",
 }
 
 SIMPLE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{END_REASONING}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 
 SIMPLE_REASONING_WITH_NEWLINE = {
     "output": f"{START_REASONING} Look!\n\nI'm thinking...{END_REASONING}\nThis is the rest",  # noqa: E501
-    "reasoning_content": " Look!\n\nI'm thinking...",
+    "reasoning": " Look!\n\nI'm thinking...",
     "content": "\nThis is the rest",
 }
 
 SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES = {
     "output": f"{START_REASONING}\nLook!\nI'm thinking...\n\n{END_REASONING}\n\n\nThis is the rest",  # noqa: E501
-    "reasoning_content": "\nLook!\nI'm thinking...\n\n",
+    "reasoning": "\nLook!\nI'm thinking...\n\n",
     "content": "\n\n\nThis is the rest",
 }
 
 NO_REASONING_ONLY_END_THINK = {
     "output": f"{END_REASONING}\n\nNo thoughts, head empty!",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "\n\nNo thoughts, head empty!",
 }
 
 REASONING_ONLY_END_THINK = {
     "output": f"The user is asking me not to think.{END_REASONING}No thoughts!",
-    "reasoning_content": "The user is asking me not to think.",
+    "reasoning": "The user is asking me not to think.",
     "content": "No thoughts!",
 }
 
@@ -148,5 +148,5 @@ def test_reasoning(
         reasoning_parser=parser, model_output=model_output, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py
index c06e40d72de2c..92a8b6ab37615 100644
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -22,47 +22,47 @@ def qwen3_tokenizer():
 # 带 <think></think>，非stream
 WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 # 带 <think></think>，stream
 WITH_THINK_STREAM = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 # 不带 <think></think>，非stream
 WITHOUT_THINK = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }
 # 不带 <think></think>，stream
 WITHOUT_THINK_STREAM = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }
 
 COMPLETE_REASONING = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTILINE_REASONING = {
     "output": "<think>This is a reasoning\nsection</think>This is the rest\nThat",
-    "reasoning_content": "This is a reasoning\nsection",
+    "reasoning": "This is a reasoning\nsection",
     "content": "This is the rest\nThat",
 }
 ONLY_OPEN_TAG = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "<think>This is a reasoning section",
 }
 
 ONLY_OPEN_TAG_STREAM = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 
@@ -138,5 +138,5 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
diff --git a/tests/reasoning/test_seedoss_reasoning_parser.py b/tests/reasoning/test_seedoss_reasoning_parser.py
index b356b8545f412..33d56d32965a0 100644
--- a/tests/reasoning/test_seedoss_reasoning_parser.py
+++ b/tests/reasoning/test_seedoss_reasoning_parser.py
@@ -28,49 +28,49 @@ def seedoss_tokenizer():
 
 SIMPLE_REASONING: dict[str, Any] = {
     "output": "This is a reasoning section</seed:think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING: dict[str, Any] = {
     "output": "This is a reasoning section</seed:think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT: dict[str, Any] = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING: dict[str, Any] = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES: dict[str, Any] = {
     "output": "This\nThat</seed:think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 WITH_START_TOKEN: dict[str, Any] = {
     "output": ("<seed:think>This is a reasoning section</seed:think>This is the rest"),
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 ONLY_END_TOKEN: dict[str, Any] = {
     "output": "Some reasoning</seed:think>This is the rest",
-    "reasoning_content": "Some reasoning",
+    "reasoning": "Some reasoning",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 NO_TOKENS: dict[str, Any] = {
     "output": "This is just content without any reasoning tokens",
-    "reasoning_content": "This is just content without any reasoning tokens",
+    "reasoning": "This is just content without any reasoning tokens",
     "content": None,
     "is_reasoning_end": False,
 }
@@ -95,7 +95,7 @@ def test_simple_reasoning(seedoss_tokenizer, streaming):
         parser, [cast(str, SIMPLE_REASONING["output"])], streaming=streaming
     )
 
-    assert reasoning == SIMPLE_REASONING["reasoning_content"]
+    assert reasoning == SIMPLE_REASONING["reasoning"]
     assert content == SIMPLE_REASONING["content"]
 
 
@@ -109,7 +109,7 @@ def test_complete_reasoning(seedoss_tokenizer, streaming):
         parser, [cast(str, COMPLETE_REASONING["output"])], streaming=streaming
     )
 
-    assert reasoning == COMPLETE_REASONING["reasoning_content"]
+    assert reasoning == COMPLETE_REASONING["reasoning"]
     assert content == COMPLETE_REASONING["content"]
 
 
@@ -123,7 +123,7 @@ def test_no_content(seedoss_tokenizer, streaming):
         parser, [cast(str, NO_CONTENT["output"])], streaming=streaming
     )
 
-    assert reasoning == NO_CONTENT["reasoning_content"]
+    assert reasoning == NO_CONTENT["reasoning"]
     assert content == NO_CONTENT["content"]
 
 
@@ -137,7 +137,7 @@ def test_multiple_lines(seedoss_tokenizer, streaming):
         parser, [cast(str, MULTIPLE_LINES["output"])], streaming=streaming
     )
 
-    assert reasoning == MULTIPLE_LINES["reasoning_content"]
+    assert reasoning == MULTIPLE_LINES["reasoning"]
     assert content == MULTIPLE_LINES["content"]
 
 
@@ -151,7 +151,7 @@ def test_with_start_token(seedoss_tokenizer, streaming):
         parser, [cast(str, WITH_START_TOKEN["output"])], streaming=streaming
     )
 
-    assert reasoning == WITH_START_TOKEN["reasoning_content"]
+    assert reasoning == WITH_START_TOKEN["reasoning"]
     assert content == WITH_START_TOKEN["content"]
 
 
@@ -168,7 +168,7 @@ def test_only_end_token(seedoss_tokenizer, streaming):
         parser, [cast(str, ONLY_END_TOKEN["output"])], streaming=streaming
     )
 
-    assert reasoning == ONLY_END_TOKEN["reasoning_content"]
+    assert reasoning == ONLY_END_TOKEN["reasoning"]
     assert content == ONLY_END_TOKEN["content"]
 
 
@@ -182,7 +182,7 @@ def test_no_tokens(seedoss_tokenizer, streaming):
         parser, [cast(str, NO_TOKENS["output"])], streaming=streaming
     )
 
-    assert reasoning == NO_TOKENS["reasoning_content"]
+    assert reasoning == NO_TOKENS["reasoning"]
     assert content == NO_TOKENS["content"]
 
 
diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py
index ccd4ff8dd263a..bd0b230a847cb 100644
--- a/tests/reasoning/utils.py
+++ b/tests/reasoning/utils.py
@@ -9,25 +9,28 @@ from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
 class StreamingReasoningReconstructor:
     def __init__(self):
-        self.reasoning_content = None
+        self.reasoning = None
         self.other_content = None
 
     def append_delta(self, delta: DeltaMessage):
         # content and the reasoning content should not be present
         # at the same time
-        assert delta.content is None or delta.reasoning_content is None, (
+        assert delta.content is None or delta.reasoning is None, (
             "Both content and reasoning content are present in the delta message"
         )
+        assert delta.reasoning == delta.reasoning_content, (
+            "reasoning_content should be present for backwards compatibility"
+        )
         if delta.content is not None:
             if self.other_content is None:
                 self.other_content = delta.content
             else:
                 self.other_content += delta.content
         else:
-            if self.reasoning_content is None:
-                self.reasoning_content = delta.reasoning_content
+            if self.reasoning is None:
+                self.reasoning = delta.reasoning
             else:
-                self.reasoning_content += delta.reasoning_content
+                self.reasoning += delta.reasoning
 
 
 def run_reasoning_extraction(
@@ -43,7 +46,7 @@ def run_reasoning_extraction(
             request,
         )
         return (
-            reconstructor.reasoning_content,
+            reconstructor.reasoning,
             reconstructor.other_content or None,
         )
     else:
@@ -69,7 +72,7 @@ def run_reasoning_extraction_mistral(
             request,
         )
         return (
-            reconstructor.reasoning_content,
+            reconstructor.reasoning,
             reconstructor.other_content or None,
         )
     else:
@@ -88,7 +91,7 @@ def run_reasoning_extraction_nonstreaming(
     request: ChatCompletionRequest | None = None,
 ) -> tuple[str | None, str | None]:
     request = request or ChatCompletionRequest(messages=[], model="test-model")
-    return reasoning_parser.extract_reasoning_content(
+    return reasoning_parser.extract_reasoning(
         model_output="".join(model_output), request=request
     )
 
@@ -110,7 +113,7 @@ def run_reasoning_extraction_streaming(
         ]
         current_text = previous_text + delta
         current_tokens = previous_tokens + token_delta
-        delta_message = reasoning_parser.extract_reasoning_content_streaming(
+        delta_message = reasoning_parser.extract_reasoning_streaming(
             previous_text,
             current_text,
             delta,
@@ -142,7 +145,7 @@ def run_reasoning_extraction_streaming_mistral(
         delta = reasoning_parser.model_tokenizer.convert_ids_to_tokens([model_delta])[0]
         current_text = previous_text + delta
         current_tokens = previous_tokens + token_delta
-        delta_message = reasoning_parser.extract_reasoning_content_streaming(
+        delta_message = reasoning_parser.extract_reasoning_streaming(
             previous_text,
             current_text,
             delta,
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
new file mode 100644
index 0000000000000..c9d227599cde5
--- /dev/null
+++ b/tests/samplers/test_logprobs.py
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm import SamplingParams
+from vllm.logprobs import FlatLogprobs
+
+MODELS = ["distilbert/distilgpt2"]
+MAX_TOKENS = 5
+NUM_TOP_LOGPROBS = 5
+NUM_PROMPT_LOGPROBS = 7
+MAX_LOGPROBS = max(NUM_TOP_LOGPROBS, NUM_PROMPT_LOGPROBS)
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("greedy", [True, False])
+@pytest.mark.parametrize("flat_logprobs", [True, False])
+def test_ranks(
+    vllm_runner,
+    model,
+    dtype,
+    greedy,
+    flat_logprobs,
+    example_prompts,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1" if flat_logprobs else "0")
+    with vllm_runner(model, dtype=dtype, max_logprobs=MAX_LOGPROBS) as vllm_model:
+        tokenizer = vllm_model.llm.get_tokenizer()
+        example_prompt_tokens = [tokenizer.encode(prompt) for prompt in example_prompts]
+        sampling_params = SamplingParams(
+            temperature=0.0 if greedy else 1.0,
+            top_p=1.0,
+            max_tokens=MAX_TOKENS,
+            logprobs=NUM_TOP_LOGPROBS,
+            prompt_logprobs=NUM_PROMPT_LOGPROBS,
+        )
+        results = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
+
+    assert len(results) == len(example_prompt_tokens)
+    for i, (result, prompt_tokens) in enumerate(zip(results, example_prompt_tokens)):
+        decode_tokens, _, decode_logprobs, prompt_logprobs = result
+
+        # Ensure the return type of logprobs is accurate
+        assert isinstance(prompt_logprobs, FlatLogprobs if flat_logprobs else list)
+        assert isinstance(decode_logprobs, FlatLogprobs if flat_logprobs else list)
+
+        ########################
+        # Check prompt logprobs
+        ########################
+        assert len(prompt_tokens) == len(prompt_logprobs)
+        # No logprob for first prompt token
+        assert not prompt_logprobs[0]
+        for position, (token, logprobs) in enumerate(
+            zip(prompt_tokens[1:], prompt_logprobs[1:]), start=1
+        ):
+            # Ensure logprobs of prompt token is always returned
+            logprob = logprobs.get(token)
+            assert logprob is not None
+            assert logprob.rank >= 1
+            # Ensure # of returned logprobs should be
+            # either NUM_PROMPT_LOGPROBS or NUM_PROMPT_LOGPROBS+1
+            assert NUM_PROMPT_LOGPROBS <= len(logprobs) <= NUM_PROMPT_LOGPROBS + 1
+            # Ensure top NUM_PROMPT_LOGPROBS is always extracted
+            assert set(range(1, NUM_PROMPT_LOGPROBS + 1)).issubset(
+                {logprob.rank for logprob in logprobs.values()}
+            )
+
+        ########################
+        # Check sample logprobs
+        ########################
+        assert len(decode_tokens) == len(decode_logprobs)
+        for position, (token, logprobs) in enumerate(
+            zip(decode_tokens, decode_logprobs)
+        ):
+            # Ensure logprobs of chosen token is always returned
+            logprob = logprobs.get(token)
+            assert logprob is not None
+            if greedy:
+                # For greedy sampling, all chosen logprob should be top ranked
+                assert logprob.rank == 1
+            else:
+                assert logprob.rank >= 1
+            # Ensure # of returned logprobs should be
+            # either NUM_TOP_LOGPROBS or NUM_TOP_LOGPROBS+1
+            assert NUM_TOP_LOGPROBS <= len(logprobs) <= NUM_TOP_LOGPROBS + 1
+            # Ensure top NUM_TOP_LOGPROBS logprobs is always extracted
+            assert set(range(1, NUM_TOP_LOGPROBS + 1)).issubset(
+                {logprob.rank for logprob in logprobs.values()}
+            )
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py
deleted file mode 100644
index 1359e6403e4c3..0000000000000
--- a/tests/samplers/test_ranks.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-from vllm import SamplingParams
-
-MODELS = ["distilbert/distilgpt2"]
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-def test_ranks(
-    vllm_runner,
-    model,
-    dtype,
-    example_prompts,
-):
-    max_tokens = 5
-    num_top_logprobs = 5
-    num_prompt_logprobs = 5
-
-    with vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs) as vllm_model:
-        ## Test greedy logprobs ranks
-        vllm_sampling_params = SamplingParams(
-            temperature=0.0,
-            top_p=1.0,
-            max_tokens=max_tokens,
-            logprobs=num_top_logprobs,
-            prompt_logprobs=num_prompt_logprobs,
-        )
-        vllm_results = vllm_model.generate_w_logprobs(
-            example_prompts, vllm_sampling_params
-        )
-
-        ## Test non-greedy logprobs ranks
-        sampling_params = SamplingParams(
-            temperature=1.0,
-            top_p=1.0,
-            max_tokens=max_tokens,
-            logprobs=num_top_logprobs,
-            prompt_logprobs=num_prompt_logprobs,
-        )
-        res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
-
-    for result in vllm_results:
-        assert result[2] is not None
-        assert len(result[2]) == len(result[0])
-        # check whether all chosen tokens have ranks = 1
-        for token, logprobs in zip(result[0], result[2]):
-            assert token in logprobs
-            assert logprobs[token].rank == 1
-
-    for result in res:
-        assert result[2] is not None
-        assert len(result[2]) == len(result[0])
-        # check whether all chosen tokens have ranks
-        for token, logprobs in zip(result[0], result[2]):
-            assert logprobs[token].rank >= 1
diff --git a/tests/test_logprobs.py b/tests/test_logprobs.py
new file mode 100644
index 0000000000000..d26a460d2bcab
--- /dev/null
+++ b/tests/test_logprobs.py
@@ -0,0 +1,222 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+
+from vllm.logprobs import (
+    FlatLogprobs,
+    Logprob,
+    LogprobsOnePosition,
+    append_logprobs_for_next_position,
+    create_prompt_logprobs,
+    create_sample_logprobs,
+)
+
+
+def test_create_logprobs_non_flat(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "0")
+
+    prompt_logprobs = create_prompt_logprobs()
+    assert isinstance(prompt_logprobs, list)
+    # Ensure first prompt position logprobs is None
+    assert len(prompt_logprobs) == 1
+    assert prompt_logprobs[0] is None
+
+    sample_logprobs = create_sample_logprobs()
+    assert isinstance(sample_logprobs, list)
+    assert len(sample_logprobs) == 0
+
+
+def test_create_logprobs_flat(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1")
+
+    prompt_logprobs = create_prompt_logprobs()
+    assert isinstance(prompt_logprobs, FlatLogprobs)
+    assert prompt_logprobs.start_indices == [0]
+    assert prompt_logprobs.end_indices == [0]
+    assert len(prompt_logprobs.token_ids) == 0
+    assert len(prompt_logprobs.logprobs) == 0
+    assert len(prompt_logprobs.ranks) == 0
+    assert len(prompt_logprobs.decoded_tokens) == 0
+    # Ensure first prompt position logprobs is empty
+    assert len(prompt_logprobs) == 1
+    assert prompt_logprobs[0] == dict()
+
+    sample_logprobs = create_sample_logprobs()
+    assert isinstance(sample_logprobs, FlatLogprobs)
+    assert len(sample_logprobs.start_indices) == 0
+    assert len(sample_logprobs.end_indices) == 0
+    assert len(sample_logprobs.token_ids) == 0
+    assert len(sample_logprobs.logprobs) == 0
+    assert len(sample_logprobs.ranks) == 0
+    assert len(sample_logprobs.decoded_tokens) == 0
+    assert len(sample_logprobs) == 0
+
+
+def test_append_logprobs_for_next_position_none_flat(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "0")
+    logprobs = create_sample_logprobs()
+    append_logprobs_for_next_position(
+        logprobs,
+        token_ids=[1],
+        logprobs=[0.1],
+        decoded_tokens=["1"],
+        rank=10,
+        num_logprobs=-1,
+    )
+    append_logprobs_for_next_position(
+        logprobs,
+        token_ids=[2, 3],
+        logprobs=[0.2, 0.3],
+        decoded_tokens=["2", "3"],
+        rank=11,
+        num_logprobs=-1,
+    )
+    assert isinstance(logprobs, list)
+    assert logprobs == [
+        {1: Logprob(logprob=0.1, rank=10, decoded_token="1")},
+        {
+            2: Logprob(logprob=0.2, rank=11, decoded_token="2"),
+            3: Logprob(logprob=0.3, rank=1, decoded_token="3"),
+        },
+    ]
+
+
+def test_append_logprobs_for_next_position_flat(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1")
+    logprobs = create_sample_logprobs()
+    append_logprobs_for_next_position(
+        logprobs,
+        token_ids=[1],
+        logprobs=[0.1],
+        decoded_tokens=["1"],
+        rank=10,
+        num_logprobs=-1,
+    )
+    append_logprobs_for_next_position(
+        logprobs,
+        token_ids=[2, 3],
+        logprobs=[0.2, 0.3],
+        decoded_tokens=["2", "3"],
+        rank=11,
+        num_logprobs=-1,
+    )
+    assert isinstance(logprobs, FlatLogprobs)
+    assert logprobs.start_indices == [0, 1]
+    assert logprobs.end_indices == [1, 3]
+    assert logprobs.token_ids == [1, 2, 3]
+    assert logprobs.logprobs == [0.1, 0.2, 0.3]
+    assert logprobs.ranks == [10, 11, 1]
+    assert logprobs.decoded_tokens == ["1", "2", "3"]
+
+
+LOGPROBS_ONE_POSITION_0: LogprobsOnePosition = {
+    1: Logprob(logprob=0.1, rank=10, decoded_token="10")
+}
+LOGPROBS_ONE_POSITION_1: LogprobsOnePosition = {
+    2: Logprob(logprob=0.2, rank=20, decoded_token="20"),
+    3: Logprob(logprob=0.3, rank=30, decoded_token="30"),
+}
+LOGPROBS_ONE_POSITION_2: LogprobsOnePosition = {
+    4: Logprob(logprob=0.4, rank=40, decoded_token="40"),
+    5: Logprob(logprob=0.5, rank=50, decoded_token="50"),
+    6: Logprob(logprob=0.6, rank=60, decoded_token="60"),
+}
+
+
+def test_flat_logprobs_append() -> None:
+    logprobs = FlatLogprobs()
+    logprobs.append(LOGPROBS_ONE_POSITION_0)
+    logprobs.append(LOGPROBS_ONE_POSITION_1)
+    assert logprobs.start_indices == [0, 1]
+    assert logprobs.end_indices == [1, 3]
+    assert logprobs.token_ids == [1, 2, 3]
+    assert logprobs.logprobs == [0.1, 0.2, 0.3]
+    assert logprobs.ranks == [10, 20, 30]
+    assert logprobs.decoded_tokens == ["10", "20", "30"]
+
+    logprobs.append(LOGPROBS_ONE_POSITION_2)
+    assert logprobs.start_indices == [0, 1, 3]
+    assert logprobs.end_indices == [1, 3, 6]
+    assert logprobs.token_ids == [1, 2, 3, 4, 5, 6]
+    assert logprobs.logprobs == [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
+    assert logprobs.ranks == [10, 20, 30, 40, 50, 60]
+    assert logprobs.decoded_tokens == ["10", "20", "30", "40", "50", "60"]
+
+
+def test_flat_logprobs_extend() -> None:
+    logprobs = FlatLogprobs()
+    # Extend with list[LogprobsOnePosition]
+    logprobs.extend([LOGPROBS_ONE_POSITION_2, LOGPROBS_ONE_POSITION_0])
+    assert logprobs.start_indices == [0, 3]
+    assert logprobs.end_indices == [3, 4]
+    assert logprobs.token_ids == [4, 5, 6, 1]
+    assert logprobs.logprobs == [0.4, 0.5, 0.6, 0.1]
+    assert logprobs.ranks == [40, 50, 60, 10]
+    assert logprobs.decoded_tokens == ["40", "50", "60", "10"]
+
+    other_logprobs = FlatLogprobs()
+    other_logprobs.extend([LOGPROBS_ONE_POSITION_1, LOGPROBS_ONE_POSITION_0])
+    # Extend with another FlatLogprobs
+    logprobs.extend(other_logprobs)
+    assert logprobs.start_indices == [0, 3, 4, 6]
+    assert logprobs.end_indices == [3, 4, 6, 7]
+    assert logprobs.token_ids == [4, 5, 6, 1, 2, 3, 1]
+    assert logprobs.logprobs == [0.4, 0.5, 0.6, 0.1, 0.2, 0.3, 0.1]
+    assert logprobs.ranks == [40, 50, 60, 10, 20, 30, 10]
+    assert logprobs.decoded_tokens == ["40", "50", "60", "10", "20", "30", "10"]
+
+
+def test_flat_logprobs_access() -> None:
+    logprobs = FlatLogprobs()
+    logprobs.extend(
+        [LOGPROBS_ONE_POSITION_1, LOGPROBS_ONE_POSITION_2, LOGPROBS_ONE_POSITION_0]
+    )
+    assert logprobs.start_indices == [0, 2, 5]
+    assert logprobs.end_indices == [2, 5, 6]
+    assert logprobs.token_ids == [2, 3, 4, 5, 6, 1]
+    assert logprobs.logprobs == [0.2, 0.3, 0.4, 0.5, 0.6, 0.1]
+    assert logprobs.ranks == [20, 30, 40, 50, 60, 10]
+    assert logprobs.decoded_tokens == ["20", "30", "40", "50", "60", "10"]
+
+    # Test __len__
+    assert len(logprobs) == 3
+
+    # Test __iter__
+    for actual_logprobs, expected_logprobs in zip(
+        logprobs,
+        [LOGPROBS_ONE_POSITION_1, LOGPROBS_ONE_POSITION_2, LOGPROBS_ONE_POSITION_0],
+    ):
+        assert actual_logprobs == expected_logprobs
+
+    # Test __getitem__ : single item
+    assert logprobs[0] == LOGPROBS_ONE_POSITION_1
+    assert logprobs[1] == LOGPROBS_ONE_POSITION_2
+    assert logprobs[2] == LOGPROBS_ONE_POSITION_0
+
+    # Test __getitem__ : slice
+    logprobs02 = logprobs[:2]
+    assert len(logprobs02) == 2
+    assert logprobs02[0] == LOGPROBS_ONE_POSITION_1
+    assert logprobs02[1] == LOGPROBS_ONE_POSITION_2
+    assert logprobs02.start_indices == [0, 2]
+    assert logprobs02.end_indices == [2, 5]
+    assert logprobs02.token_ids == [2, 3, 4, 5, 6]
+    assert logprobs02.logprobs == [0.2, 0.3, 0.4, 0.5, 0.6]
+    assert logprobs02.ranks == [20, 30, 40, 50, 60]
+    assert logprobs02.decoded_tokens == ["20", "30", "40", "50", "60"]
+    logprobs_last2 = logprobs[-2:]
+    assert len(logprobs_last2) == 2
+    assert logprobs_last2[0] == LOGPROBS_ONE_POSITION_2
+    assert logprobs_last2[1] == LOGPROBS_ONE_POSITION_0
+    assert logprobs_last2.start_indices == [0, 3]
+    assert logprobs_last2.end_indices == [3, 4]
+    assert logprobs_last2.token_ids == [4, 5, 6, 1]
+    assert logprobs_last2.logprobs == [0.4, 0.5, 0.6, 0.1]
+    assert logprobs_last2.ranks == [40, 50, 60, 10]
+    assert logprobs_last2.decoded_tokens == ["40", "50", "60", "10"]
diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py
index ebf107217c3cb..1ada8ee187c38 100644
--- a/tests/tokenization/test_mistral_tokenizer.py
+++ b/tests/tokenization/test_mistral_tokenizer.py
@@ -102,7 +102,7 @@ def test_prepare_apply_chat_template_tools_and_messages(
     assert actual_request == expected_mistral_output
 
 
-# Tool use with list content and reasoning_content
+# Tool use with list content and reasoning
 @pytest.mark.parametrize(
     "openai_request,expected_mistral_output",
     [
@@ -115,7 +115,7 @@ def test_prepare_apply_chat_template_tools_and_messages(
                     },
                     {
                         "role": "assistant",
-                        "reasoning_content": None,
+                        "reasoning": None,
                         "content": None,
                         "tool_calls": [
                             {
@@ -334,20 +334,20 @@ class TestMistralTokenizer:
 
     def test_encode(self, mistral_tokenizer: MistralTokenizer):
         token_ids = (
-            [1, 22177, 4304, 2662, 2]
+            [1, 22177, 4304, 2662]
             if mistral_tokenizer.is_tekken
-            else [1, 23325, 2294, 1686, 2]
+            else [1, 23325, 2294, 1686]
         )
 
-        assert mistral_tokenizer.encode("Hello world !") == token_ids[:-1]
-        assert mistral_tokenizer.encode("Hello world !", max_length=3) == token_ids[:-2]
+        assert mistral_tokenizer.encode("Hello world !") == token_ids
+        assert mistral_tokenizer.encode("Hello world !", max_length=3) == token_ids[:-1]
         assert (
             mistral_tokenizer.encode("Hello world !", truncation=True, max_length=3)
-            == token_ids[:-2]
+            == token_ids[:-1]
         )
         assert (
             mistral_tokenizer.encode("Hello world !", truncation=False, max_length=3)
-            == token_ids[:-1]
+            == token_ids
         )
 
         assert (
@@ -358,7 +358,7 @@ class TestMistralTokenizer:
             mistral_tokenizer.encode(
                 "Hello world !", add_special_tokens=True, max_length=3
             )
-            == token_ids[:-2]
+            == token_ids[:-1]
         )
         assert (
             mistral_tokenizer.encode(
@@ -368,7 +368,7 @@ class TestMistralTokenizer:
         )
         assert (
             mistral_tokenizer.encode("Hello world !", add_special_tokens=False)
-            == token_ids[1:-1]
+            == token_ids[1:]
         )
 
     @pytest.mark.parametrize(
@@ -1088,6 +1088,19 @@ class TestMistralTokenizer:
             == expected_tokens[mistral_tokenizer.is_tekken]
         )
 
+    def test_decode_int(
+        self,
+        mistral_tokenizer: MistralTokenizer,
+    ):
+        ids = 1
+        assert (
+            mistral_tokenizer.decode(
+                ids,
+                skip_special_tokens=False,
+            )
+            == "<s>"
+        )
+
     def test_convert_tokens_to_string(self, mistral_tokenizer: MistralTokenizer):
         tokens = (
             [
diff --git a/tests/tool_use/test_deepseekv31_tool_parser.py b/tests/tool_use/test_deepseekv31_tool_parser.py
index 9b7e71b49c05b..db5168071fbce 100644
--- a/tests/tool_use/test_deepseekv31_tool_parser.py
+++ b/tests/tool_use/test_deepseekv31_tool_parser.py
@@ -3,7 +3,9 @@
 
 import pytest
 
-from vllm.entrypoints.openai.tool_parsers import DeepSeekV31ToolParser
+from vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser import (
+    DeepSeekV31ToolParser,
+)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MODEL = "deepseek-ai/DeepSeek-V3.1"
diff --git a/tests/tool_use/test_ernie45_moe_tool_parser.py b/tests/tool_use/test_ernie45_moe_tool_parser.py
index 0862d14812d72..36a07bb561d9e 100644
--- a/tests/tool_use/test_ernie45_moe_tool_parser.py
+++ b/tests/tool_use/test_ernie45_moe_tool_parser.py
@@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (
     FunctionCall,
     ToolCall,
 )
-from vllm.entrypoints.openai.tool_parsers import Ernie45ToolParser
+from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser
 from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
@@ -337,7 +337,7 @@ def test_extract_tool_calls_streaming_incremental(
         if (
             delta_message.role is None
             and delta_message.content is None
-            and delta_message.reasoning_content is None
+            and delta_message.reasoning is None
             and len(delta_message.tool_calls) == 0
         ):
             continue
diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py
index 6f1f6671d9b3c..f545f52c02dcb 100644
--- a/tests/tool_use/test_glm4_moe_tool_parser.py
+++ b/tests/tool_use/test_glm4_moe_tool_parser.py
@@ -7,7 +7,9 @@ import json
 import pytest
 
 from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
-from vllm.entrypoints.openai.tool_parsers import Glm4MoeModelToolParser
+from vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser import (
+    Glm4MoeModelToolParser,
+)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py
index 6dcdd5ba2ce76..9eb73b80fa9b4 100644
--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
@@ -9,7 +9,7 @@ import pytest
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall
-from vllm.entrypoints.openai.tool_parsers import JambaToolParser
+from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser
 from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
diff --git a/tests/tool_use/test_kimi_k2_tool_parser.py b/tests/tool_use/test_kimi_k2_tool_parser.py
index 43b8c70acbfc3..c358589dbc292 100644
--- a/tests/tool_use/test_kimi_k2_tool_parser.py
+++ b/tests/tool_use/test_kimi_k2_tool_parser.py
@@ -7,7 +7,7 @@ import json
 import pytest
 
 from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
-from vllm.entrypoints.openai.tool_parsers import KimiK2ToolParser
+from vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_minimax_tool_parser.py b/tests/tool_use/test_minimax_tool_parser.py
index 8610656fa288d..4332984083dab 100644
--- a/tests/tool_use/test_minimax_tool_parser.py
+++ b/tests/tool_use/test_minimax_tool_parser.py
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
     FunctionCall,
     ToolCall,
 )
-from vllm.entrypoints.openai.tool_parsers import MinimaxToolParser
+from vllm.entrypoints.openai.tool_parsers.minimax_tool_parser import MinimaxToolParser
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_openai_tool_parser.py b/tests/tool_use/test_openai_tool_parser.py
index f6223f3fdce4f..c874a9601ae70 100644
--- a/tests/tool_use/test_openai_tool_parser.py
+++ b/tests/tool_use/test_openai_tool_parser.py
@@ -15,7 +15,7 @@ from openai_harmony import (
 )
 
 from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
-from vllm.entrypoints.openai.tool_parsers import OpenAIToolParser
+from vllm.entrypoints.openai.tool_parsers.openai_tool_parser import OpenAIToolParser
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MODEL = "gpt2"
diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py
index 1133b949f2270..1367ad87cb019 100644
--- a/tests/tool_use/test_seed_oss_tool_parser.py
+++ b/tests/tool_use/test_seed_oss_tool_parser.py
@@ -14,7 +14,7 @@ from vllm.entrypoints.openai.protocol import (
     FunctionCall,
     ToolCall,
 )
-from vllm.entrypoints.openai.tool_parsers import SeedOssToolParser
+from vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser import SeedOssToolParser
 from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py
index 8c27b2911f8f9..122b427d60409 100644
--- a/tests/tool_use/test_xlam_tool_parser.py
+++ b/tests/tool_use/test_xlam_tool_parser.py
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
     FunctionCall,
     ToolCall,
 )
-from vllm.entrypoints.openai.tool_parsers import xLAMToolParser
+from vllm.entrypoints.openai.tool_parsers.xlam_tool_parser import xLAMToolParser
 from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
diff --git a/tests/transformers_utils/test_get_processor_kwargs_from_processor.py b/tests/transformers_utils/test_get_processor_kwargs_from_processor.py
new file mode 100644
index 0000000000000..95ff9a557fa05
--- /dev/null
+++ b/tests/transformers_utils/test_get_processor_kwargs_from_processor.py
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import importlib
+
+from transformers.processing_utils import ProcessingKwargs
+from typing_extensions import Unpack
+
+from vllm.transformers_utils.processor import (
+    get_processor_kwargs_from_processor,
+)
+
+
+class _FakeProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore
+    pass
+
+
+def _assert_has_all_expected(keys: set[str]) -> None:
+    # text
+    for k in ("text_pair", "text_target", "text_pair_target"):
+        assert k in keys
+    # image
+    for k in ("do_convert_rgb", "do_resize"):
+        assert k in keys
+    # audio
+    for k in (
+        "fps",
+        "do_sample_frames",
+        "input_data_format",
+        "default_to_square",
+    ):
+        assert k in keys
+    # audio
+    for k in ("padding", "return_attention_mask"):
+        assert k in keys
+
+
+# Path 1: __call__ method has kwargs: Unpack[*ProcessingKwargs]
+class _ProcWithUnpack:
+    def __call__(self, *args, **kwargs: Unpack[_FakeProcessorKwargs]):  # type: ignore
+        return None
+
+
+def test_get_processor_kwargs_from_processor_unpack_path_returns_full_union():
+    proc = _ProcWithUnpack()
+    keys = get_processor_kwargs_from_processor(proc)
+    _assert_has_all_expected(keys)
+
+
+# ---- Path 2: No Unpack, fallback to scanning *ProcessingKwargs in module ----
+
+
+class _ProcWithoutUnpack:
+    def __call__(self, *args, **kwargs):
+        return None
+
+
+def test_get_processor_kwargs_from_processor_module_scan_returns_full_union():
+    # ensure the module scanned by fallback is this test module
+    module_name = _ProcWithoutUnpack.__module__
+    mod = importlib.import_module(module_name)
+    assert hasattr(mod, "_FakeProcessorKwargs")
+
+    proc = _ProcWithoutUnpack()
+    keys = get_processor_kwargs_from_processor(proc)
+    _assert_has_all_expected(keys)
diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py
new file mode 100644
index 0000000000000..beaef04d766bf
--- /dev/null
+++ b/tests/transformers_utils/test_utils.py
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from vllm.transformers_utils.utils import is_cloud_storage, is_gcs, is_s3
+
+
+def test_is_gcs():
+    assert is_gcs("gs://model-path")
+    assert not is_gcs("s3://model-path/path-to-model")
+    assert not is_gcs("/unix/local/path")
+    assert not is_gcs("nfs://nfs-fqdn.local")
+
+
+def test_is_s3():
+    assert is_s3("s3://model-path/path-to-model")
+    assert not is_s3("gs://model-path")
+    assert not is_s3("/unix/local/path")
+    assert not is_s3("nfs://nfs-fqdn.local")
+
+
+def test_is_cloud_storage():
+    assert is_cloud_storage("gs://model-path")
+    assert is_cloud_storage("s3://model-path/path-to-model")
+    assert not is_cloud_storage("/unix/local/path")
+    assert not is_cloud_storage("nfs://nfs-fqdn.local")
diff --git a/tests/utils.py b/tests/utils.py
index 8fac003b8e9bf..1fc8b260d1e9b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -254,6 +254,23 @@ class RemoteOpenAIServer:
             **kwargs,
         )
 
+    def get_client_anthropic(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
+        return anthropic.Anthropic(
+            base_url=self.url_for(),
+            api_key=self.DUMMY_API_KEY,
+            max_retries=0,
+            **kwargs,
+        )
+
+    def get_async_client_anthropic(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
+        return anthropic.AsyncAnthropic(
+            base_url=self.url_for(), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs
+        )
+
 
 class RemoteOpenAIServerCustom(RemoteOpenAIServer):
     """Launch test server with custom child process"""
@@ -300,131 +317,6 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
             self.proc.kill()
 
 
-class RemoteAnthropicServer:
-    DUMMY_API_KEY = "token-abc123"  # vLLM's Anthropic server does not need API key
-
-    def __init__(
-        self,
-        model: str,
-        vllm_serve_args: list[str],
-        *,
-        env_dict: dict[str, str] | None = None,
-        seed: int | None = 0,
-        auto_port: bool = True,
-        max_wait_seconds: float | None = None,
-    ) -> None:
-        if auto_port:
-            if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
-                raise ValueError(
-                    "You have manually specified the port when `auto_port=True`."
-                )
-
-            # Don't mutate the input args
-            vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())]
-        if seed is not None:
-            if "--seed" in vllm_serve_args:
-                raise ValueError(
-                    f"You have manually specified the seed when `seed={seed}`."
-                )
-
-            vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]
-
-        parser = FlexibleArgumentParser(description="vLLM's remote Anthropic server.")
-        subparsers = parser.add_subparsers(required=False, dest="subparser")
-        parser = ServeSubcommand().subparser_init(subparsers)
-        args = parser.parse_args(["--model", model, *vllm_serve_args])
-        self.host = str(args.host or "localhost")
-        self.port = int(args.port)
-
-        self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None
-
-        # download the model before starting the server to avoid timeout
-        is_local = os.path.isdir(model)
-        if not is_local:
-            engine_args = AsyncEngineArgs.from_cli_args(args)
-            model_config = engine_args.create_model_config()
-            load_config = engine_args.create_load_config()
-
-            model_loader = get_model_loader(load_config)
-            model_loader.download_model(model_config)
-
-        env = os.environ.copy()
-        # the current process might initialize cuda,
-        # to be safe, we should use spawn method
-        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-        if env_dict is not None:
-            env.update(env_dict)
-        self.proc = subprocess.Popen(
-            [
-                sys.executable,
-                "-m",
-                "vllm.entrypoints.anthropic.api_server",
-                model,
-                *vllm_serve_args,
-            ],
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-        )
-        max_wait_seconds = max_wait_seconds or 240
-        self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.proc.terminate()
-        try:
-            self.proc.wait(8)
-        except subprocess.TimeoutExpired:
-            # force kill if needed
-            self.proc.kill()
-
-    def _wait_for_server(self, *, url: str, timeout: float):
-        # run health check
-        start = time.time()
-        while True:
-            try:
-                if requests.get(url).status_code == 200:
-                    break
-            except Exception:
-                # this exception can only be raised by requests.get,
-                # which means the server is not ready yet.
-                # the stack trace is not useful, so we suppress it
-                # by using `raise from None`.
-                result = self.proc.poll()
-                if result is not None and result != 0:
-                    raise RuntimeError("Server exited unexpectedly.") from None
-
-                time.sleep(0.5)
-                if time.time() - start > timeout:
-                    raise RuntimeError("Server failed to start in time.") from None
-
-    @property
-    def url_root(self) -> str:
-        return f"http://{self.host}:{self.port}"
-
-    def url_for(self, *parts: str) -> str:
-        return self.url_root + "/" + "/".join(parts)
-
-    def get_client(self, **kwargs):
-        if "timeout" not in kwargs:
-            kwargs["timeout"] = 600
-        return anthropic.Anthropic(
-            base_url=self.url_for(),
-            api_key=self.DUMMY_API_KEY,
-            max_retries=0,
-            **kwargs,
-        )
-
-    def get_async_client(self, **kwargs):
-        if "timeout" not in kwargs:
-            kwargs["timeout"] = 600
-        return anthropic.AsyncAnthropic(
-            base_url=self.url_for(), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs
-        )
-
-
 def _test_completion(
     client: openai.OpenAI,
     model: str,
diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py
index 6659b3eb1e98f..08aeb6f298f61 100644
--- a/tests/v1/attention/test_attention_backends.py
+++ b/tests/v1/attention/test_attention_backends.py
@@ -295,6 +295,7 @@ def _test_backend_correctness(
     block_size: int = 16,
     atol: float = 1e-2,
     rtol: float = 1e-2,
+    tensor_parallel_size: int = 1,
 ):
     """
     Test that all backends produce similar outputs to a reference implementation
@@ -310,13 +311,38 @@ def _test_backend_correctness(
     4. Running each vLLM attention backend with the new queries and the
        simulated paged KV cache.
     5. Comparing the vLLM backend's output to the ground-truth SDPA output.
+
+    Note: When tensor_parallel_size > 1, we simulate the head partitioning
+    by overriding the model config to use fewer heads, without requiring
+    multiple GPUs. This tests that backends work correctly with different
+    head counts.
     """
     current_platform.seed_everything(42)
+
+    hf_config_override = None
+    if tensor_parallel_size > 1:
+        from vllm.config import ModelConfig
+
+        temp_config = ModelConfig(model=model, max_model_len=1)
+        original_num_heads = temp_config.hf_text_config.num_attention_heads
+        original_num_kv_heads = getattr(
+            temp_config.hf_text_config, "num_key_value_heads", None
+        )
+        hf_config_override = {
+            "num_attention_heads": original_num_heads // tensor_parallel_size,
+        }
+        if original_num_kv_heads is not None:
+            hf_config_override["num_key_value_heads"] = max(
+                1, original_num_kv_heads // tensor_parallel_size
+            )
+
     vllm_config = create_vllm_config(
         model_name=model,
+        tensor_parallel_size=1,  # Always use TP=1 to avoid multi-GPU requirements
         max_model_len=max(batch_spec.seq_lens),
         block_size=block_size,
         num_gpu_blocks=8192,
+        hf_config_override=hf_config_override,
     )
     device = torch.device("cuda:0")
 
@@ -503,7 +529,10 @@ def _test_backend_correctness(
     ],
 )
 @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
-def test_causal_backend_correctness(batch_spec_name: str, model: str):
+@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4])
+def test_causal_backend_correctness(
+    batch_spec_name: str, model: str, tensor_parallel_size: int
+):
     """Test backend's correctness with causal attention."""
 
     def causal_mask_mod(
@@ -523,12 +552,23 @@ def test_causal_backend_correctness(batch_spec_name: str, model: str):
     SMALL_BLOCK_BACKENDS = [
         x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
     ]
-    _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS, causal_mask_mod)
+    _test_backend_correctness(
+        batch_spec,
+        model,
+        SMALL_BLOCK_BACKENDS,
+        causal_mask_mod,
+        tensor_parallel_size=tensor_parallel_size,
+    )
 
     # Fast FlexAttention needs to run with block_size=128
     if LARGE_BLOCK_BACKENDS:
         _test_backend_correctness(
-            batch_spec, model, LARGE_BLOCK_BACKENDS, causal_mask_mod, block_size=128
+            batch_spec,
+            model,
+            LARGE_BLOCK_BACKENDS,
+            causal_mask_mod,
+            block_size=128,
+            tensor_parallel_size=tensor_parallel_size,
         )
 
 
@@ -545,7 +585,10 @@ SLIDING_WINDOW_BACKENDS_TO_TEST = [
     ["small_decode", "small_prefill", "mixed_medium", "large_decode", "large_prefill"],
 )
 @pytest.mark.parametrize("model", ["microsoft/Phi-tiny-MoE-instruct"])
-def test_sliding_window_backend_correctness(batch_spec_name: str, model: str):
+@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4])
+def test_sliding_window_backend_correctness(
+    batch_spec_name: str, model: str, tensor_parallel_size: int
+):
     """Test backend's correctness with sliding window attention."""
 
     def sliding_window_mask_mod(
@@ -575,7 +618,11 @@ def test_sliding_window_backend_correctness(batch_spec_name: str, model: str):
         x for x in SLIDING_WINDOW_BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
     ]
     _test_backend_correctness(
-        batch_spec, model, SMALL_BLOCK_BACKENDS, sliding_window_mask_mod_fn
+        batch_spec,
+        model,
+        SMALL_BLOCK_BACKENDS,
+        sliding_window_mask_mod_fn,
+        tensor_parallel_size=tensor_parallel_size,
     )
 
     # Fast FlexAttention needs to run with block_size=128
@@ -586,4 +633,5 @@ def test_sliding_window_backend_correctness(batch_spec_name: str, model: str):
             LARGE_BLOCK_BACKENDS,
             sliding_window_mask_mod_fn,
             block_size=128,
+            tensor_parallel_size=tensor_parallel_size,
         )
diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py
index cda4fb11c096e..5679fafe63ee8 100644
--- a/tests/v1/attention/test_mla_backends.py
+++ b/tests/v1/attention/test_mla_backends.py
@@ -394,8 +394,11 @@ def run_attention_backend(
         "spec_decode_medium",
     ],
 )
-@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
-def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
+@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-R1"])
+@pytest.mark.parametrize("tensor_parallel_size", [1, 4, 8, 16])
+def test_backend_correctness(
+    dist_init, batch_spec_name: str, model: str, tensor_parallel_size: int
+):
     """
     Test that all backends produce similar outputs to a reference implementation
     using torch.nn.functional.scaled_dot_product_attention.
@@ -410,6 +413,11 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
     4. Running each vLLM attention backend with the new queries and the
        simulated paged KV cache.
     5. Comparing the vLLM backend's output to the ground-truth SDPA output.
+
+    Note: When tensor_parallel_size > 1, we simulate the head partitioning
+    by overriding the model config to use fewer heads, without requiring
+    multiple GPUs. This tests that backends work correctly with different
+    head counts.
     """
 
     batch_spec = BATCH_SPECS[batch_spec_name]
@@ -423,11 +431,30 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
     # Add 1 for null block at index 0, and some buffer
     num_gpu_blocks = required_blocks + 1 + 100
 
+    hf_config_override = None
+    if tensor_parallel_size > 1:
+        from vllm.config import ModelConfig
+
+        temp_config = ModelConfig(model=model, max_model_len=1)
+        original_num_heads = temp_config.hf_text_config.num_attention_heads
+        original_num_kv_heads = getattr(
+            temp_config.hf_text_config, "num_key_value_heads", None
+        )
+        hf_config_override = {
+            "num_attention_heads": original_num_heads // tensor_parallel_size,
+        }
+        if original_num_kv_heads is not None:
+            hf_config_override["num_key_value_heads"] = max(
+                1, original_num_kv_heads // tensor_parallel_size
+            )
+
     vllm_config = create_vllm_config(
         model_name=model,
+        tensor_parallel_size=1,  # Always use TP=1 to avoid multi-GPU requirements
         max_model_len=max(batch_spec.seq_lens),
         num_gpu_blocks=num_gpu_blocks,
         block_size=default_block_size,
+        hf_config_override=hf_config_override,
     )
 
     # For spec decode tests, add a speculative_config to set the reorder_batch_threshold
diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py
index 02324d2aca6ef..b34d587eb362d 100644
--- a/tests/v1/attention/test_sparse_mla_backends.py
+++ b/tests/v1/attention/test_sparse_mla_backends.py
@@ -113,7 +113,10 @@ def _quantize_dequantize_fp8_ds_mla(
 
 @pytest.mark.parametrize("batch_name", list(SPARSE_BACKEND_BATCH_SPECS.keys()))
 @pytest.mark.parametrize("kv_cache_dtype", ["fp8_ds_mla", "auto"])
-def test_sparse_backend_decode_correctness(dist_init, batch_name, kv_cache_dtype):
+@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4])
+def test_sparse_backend_decode_correctness(
+    dist_init, batch_name, kv_cache_dtype, tensor_parallel_size
+):
     if not torch.cuda.is_available():
         pytest.skip("CUDA is required for sparse MLA decode test")
 
@@ -135,8 +138,11 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name, kv_cache_dtype
     total_cache_tokens = sum(batch_spec.seq_lens)
     block_size = 64
 
+    # Note: We use TP=1 to avoid multi-GPU requirements in CI.
+    # The test simulates head partitioning via mocked methods below.
     vllm_config = create_vllm_config(
         model_name="deepseek-ai/DeepSeek-V2-Lite-Chat",
+        tensor_parallel_size=1,
         max_model_len=max_seqlen,
         num_gpu_blocks=max(2048, cdiv(total_cache_tokens, block_size) + 1),
         block_size=block_size,
@@ -156,7 +162,8 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name, kv_cache_dtype
     )
     model_config.dtype = dtype
     model_config.get_num_attention_heads = MethodType(
-        lambda self, parallel_config: num_heads, model_config
+        lambda self, parallel_config: max(1, num_heads // tensor_parallel_size),
+        model_config,
     )
     model_config.get_num_kv_heads = MethodType(
         lambda self, parallel_config: 1, model_config
diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py
index 6d870b5640dfb..e0645ed43015e 100644
--- a/tests/v1/core/test_async_scheduler.py
+++ b/tests/v1/core/test_async_scheduler.py
@@ -34,15 +34,20 @@ def test_stop_by_max_tokens(max_tokens: int):
     requests = create_requests(num_requests=2, max_tokens=max_tokens)
     req0, req1 = requests
 
+    expected_total_num_scheduled_tokens = 0
     sched_outputs: deque[SchedulerOutput] = deque()
     scheduler.add_request(req0)
     sched_outputs.append(scheduler.schedule())
+    expected_total_num_scheduled_tokens += req0.num_prompt_tokens + max_tokens - 1
 
     scheduler.add_request(req1)
     sched_outputs.append(scheduler.schedule())
+    expected_total_num_scheduled_tokens += req1.num_prompt_tokens + max_tokens - 1
 
+    total_num_scheduled_tokens = 0
     while sched_outputs:
         sched_output = sched_outputs.popleft()
+        total_num_scheduled_tokens += sched_output.total_num_scheduled_tokens
         model_runner_output = _make_model_runner_output(sched_output)
         scheduler.update_from_output(sched_output, model_runner_output)
 
@@ -53,6 +58,8 @@ def test_stop_by_max_tokens(max_tokens: int):
     assert scheduler.get_num_unfinished_requests() == 0
     assert req0.num_output_tokens == max_tokens
     assert req1.num_output_tokens == max_tokens
+    # Ensure we aren't scheduling more tokens than necessary.
+    assert total_num_scheduled_tokens == expected_total_num_scheduled_tokens
 
 
 def test_abort():
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 837a513cb75e1..2291f363731f2 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -9,7 +9,8 @@ import pytest
 import torch
 
 import vllm.v1.core.kv_cache_utils as kv_cache_utils
-from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
+from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved, BlockStored
+from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import (
     MultiModalFeatureSpec,
     MultiModalKwargsItem,
@@ -59,6 +60,7 @@ def make_request(
     mm_hashes: list[str] | None = None,
     prompt_logprobs: int | None = None,
     cache_salt: str | None = None,
+    lora_request: LoRARequest | None = None,
 ):
     mm_features = []
     if mm_positions is not None:
@@ -79,7 +81,7 @@ def make_request(
         sampling_params=SamplingParams(max_tokens=17, prompt_logprobs=prompt_logprobs),
         pooling_params=None,
         eos_token_id=100,
-        lora_request=None,
+        lora_request=lora_request,
         cache_salt=cache_salt,
         block_hasher=get_request_block_hasher(block_size, hash_fn),
     )
@@ -1337,6 +1339,63 @@ def test_kv_cache_events(blocks_to_cache: int):
     assert len(manager.block_pool.cached_block_hash_to_block) == 0
 
 
+@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
+def test_kv_cache_events_with_lora(blocks_to_cache: int):
+    """Test BlockStored events contain correct lora_id when using LoRA requests."""
+    block_size = 16
+    num_blocks = blocks_to_cache + 1
+
+    # Create KVCacheManager with events enabled
+    manager = KVCacheManager(
+        make_kv_cache_config(block_size, num_blocks),
+        max_model_len=8192,
+        enable_caching=True,
+        enable_kv_cache_events=True,
+    )
+
+    # Test with LoRA request
+    lora_request = LoRARequest(
+        lora_name="test_lora", lora_int_id=42, lora_path="/test/path"
+    )
+
+    num_tokens = block_size * blocks_to_cache
+    req_with_lora = make_request(
+        "lora_req",
+        list(range(num_tokens)),
+        block_size,
+        sha256,
+        lora_request=lora_request,
+    )
+
+    # Allocate slots and get events
+    _ = manager.allocate_slots(req_with_lora, num_tokens)
+    events = manager.take_events()
+
+    # Verify BlockStored event contains correct lora_id
+    block_stored_event = events[-1]
+    assert isinstance(block_stored_event, BlockStored)
+    assert block_stored_event.lora_id == 42  # Should match lora_request.adapter_id
+    assert len(block_stored_event.block_hashes) == blocks_to_cache
+    assert block_stored_event.block_size == block_size
+
+    # Clean up
+    manager.free(req_with_lora)
+
+    # Test without LoRA request (should have lora_id=None)
+    req_without_lora = make_request(
+        "no_lora_req", list(range(num_tokens)), block_size, sha256
+    )
+
+    _ = manager.allocate_slots(req_without_lora, num_tokens)
+    events = manager.take_events()
+
+    block_stored_event = events[-1]
+    assert isinstance(block_stored_event, BlockStored)
+    assert block_stored_event.lora_id is None  # Should be None when no LoRA request
+    assert len(block_stored_event.block_hashes) == blocks_to_cache
+    assert block_stored_event.block_size == block_size
+
+
 def test_eagle_enabled_removes_last_block():
     """Verify Eagle does NOT remove blocks when request
     length is divisible by block size."""
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 92e3831b9c7a6..749cf7dc8397e 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -891,7 +891,6 @@ def test_kv_connector_basic():
     scheduler = create_scheduler(
         enable_prefix_caching=True,
         use_kv_connector=True,
-        disable_hybrid_kv_cache_manager=True,
     )
     NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks()
     BLOCK_SIZE = scheduler.cache_config.block_size
@@ -1017,7 +1016,6 @@ def test_external_prefix_cache_metrics():
     scheduler = create_scheduler(
         enable_prefix_caching=False,
         use_kv_connector=True,
-        disable_hybrid_kv_cache_manager=True,
     )
 
     # Mock connector to simulate a partial external cache hit
@@ -1082,7 +1080,6 @@ def test_kv_connector_unable_to_allocate():
         use_kv_connector=True,
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
-        disable_hybrid_kv_cache_manager=True,
     )
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
     scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
@@ -1166,7 +1163,6 @@ def test_kv_connector_handles_preemption():
         use_kv_connector=True,
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
-        disable_hybrid_kv_cache_manager=True,
     )
 
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
@@ -1383,7 +1379,6 @@ def create_scheduler_with_priority(
     block_size: int = 16,
     max_model_len: int | None = None,
     num_speculative_tokens: int | None = None,
-    disable_hybrid_kv_cache_manager: bool = False,
 ) -> Scheduler:
     """Create scheduler with priority policy enabled.
 
@@ -1408,7 +1403,6 @@ def create_scheduler_with_priority(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
         policy="priority",  # Enable priority scheduling
-        disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
     )
     model_config = ModelConfig(
         model=model,
@@ -2015,7 +2009,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
         num_blocks=5,  # Can hold 64 tokens (first block is null)
         block_size=16,  # Standard block size
         use_kv_connector=True,
-        disable_hybrid_kv_cache_manager=True,
     )
 
     # Create a request and schedule it
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index 3f5e1b9eeaf73..6e739d6b0e77a 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -46,7 +46,6 @@ def create_scheduler(
     num_speculative_tokens: int | None = None,
     skip_tokenizer_init: bool = False,
     async_scheduling: bool = False,
-    disable_hybrid_kv_cache_manager: bool = False,
 ) -> Scheduler | AsyncScheduler:
     """Create scheduler under test.
 
@@ -71,7 +70,6 @@ def create_scheduler(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
         async_scheduling=async_scheduling,
-        disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
     )
     model_config = ModelConfig(
         model=model,
diff --git a/tests/v1/distributed/test_async_llm_dp.py b/tests/v1/distributed/test_async_llm_dp.py
index 98d6ef7dbf440..60f9017184ea0 100644
--- a/tests/v1/distributed/test_async_llm_dp.py
+++ b/tests/v1/distributed/test_async_llm_dp.py
@@ -20,13 +20,6 @@ from vllm.v1.metrics.stats import IterationStats, MultiModalCacheStats, Schedule
 
 DP_SIZE = int(os.getenv("DP_SIZE", 2))
 
-engine_args = AsyncEngineArgs(
-    model="ibm-research/PowerMoE-3b",
-    enforce_eager=True,
-    tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
-    data_parallel_size=DP_SIZE,
-)
-
 
 async def generate(
     engine: AsyncLLM,
@@ -65,6 +58,13 @@ async def generate(
     return count, request_id
 
 
+@pytest.mark.parametrize(
+    "model",
+    [
+        "ibm-research/PowerMoE-3b",
+        "hmellor/tiny-random-LlamaForCausalLM",
+    ],
+)
 @pytest.mark.parametrize(
     "output_kind",
     [
@@ -76,7 +76,10 @@ async def generate(
 @pytest.mark.parametrize("async_scheduling", [True, False])
 @pytest.mark.asyncio
 async def test_load(
-    output_kind: RequestOutputKind, data_parallel_backend: str, async_scheduling: bool
+    model: str,
+    output_kind: RequestOutputKind,
+    data_parallel_backend: str,
+    async_scheduling: bool,
 ):
     if async_scheduling and data_parallel_backend == "ray":
         # TODO(NickLucche) Re-enable when async scheduling is supported
@@ -107,8 +110,14 @@ async def test_load(
     with ExitStack() as after:
         prompt = "This is a test of data parallel"
 
-        engine_args.data_parallel_backend = data_parallel_backend
-        engine_args.async_scheduling = async_scheduling
+        engine_args = AsyncEngineArgs(
+            model=model,
+            enforce_eager=True,
+            tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
+            data_parallel_size=DP_SIZE,
+            data_parallel_backend=data_parallel_backend,
+            async_scheduling=async_scheduling,
+        )
         engine = AsyncLLM.from_engine_args(
             engine_args, stat_loggers=[SimpleStatsLogger]
         )
diff --git a/tests/v1/distributed/test_dbo.py b/tests/v1/distributed/test_dbo.py
new file mode 100644
index 0000000000000..866ae742bf3c0
--- /dev/null
+++ b/tests/v1/distributed/test_dbo.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test Dual Batch Overlap (DBO) with Data Parallelism + Expert Parallelism.
+
+DBO is specifically designed for DP+EP scenarios to hide communication latency
+by overlapping computation of two batches. This test validates that DBO works
+correctly with the DeepSeek-V2-Lite model using GSM8K evaluation.
+"""
+
+import pytest
+
+from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
+from tests.utils import RemoteOpenAIServer
+
+MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+DP_SIZE = 2
+
+# GSM8K eval configuration
+NUM_QUESTIONS = 256  # Fast eval for CI; but must be large enough to hit dbo thresholds
+NUM_SHOTS = 5  # Few-shot examples
+MIN_ACCURACY = 0.62  # Expected 0.64 with 2% buffer (based on vLLM test data)
+
+# Increase max_num_seqs to trigger DBO for decode batches
+# With 64 seqs, decode batches should exceed the 32 token threshold
+MAX_NUM_SEQS = 64  # Increased from 16 to trigger decode DBO
+
+# DeepEP backends to test
+DEEPEP_BACKENDS = [
+    "deepep_low_latency",
+    "deepep_high_throughput",
+]
+
+
+@pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS)
+def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
+    """
+    Test DBO with DP+EP using GSM8K evaluation.
+    """
+    required_gpus = DP_SIZE
+
+    if num_gpus_available < required_gpus:
+        pytest.skip(f"Need at least {required_gpus} GPUs (DP={DP_SIZE})")
+
+    # Server arguments for DBO + DP + EP
+    server_args = [
+        "--max-model-len",
+        "4096",
+        "--max-num-seqs",
+        str(MAX_NUM_SEQS),  # Use larger batch to trigger decode DBO
+        "--trust-remote-code",
+        # Note: Not using --enforce-eager to test DBO's alternate CUDA graph dispatching
+        "--data-parallel-size",
+        str(DP_SIZE),
+        "--enable-expert-parallel",
+        "--enable-dbo",
+        # Fix threshold so we know we trigger DBO
+        "--dbo-decode-token-threshold",
+        "16",
+        "--dbo-prefill-token-threshold",
+        "256",
+        "--all2all-backend",
+        all2all_backend,
+    ]
+
+    with RemoteOpenAIServer(
+        MODEL_NAME,
+        server_args,
+        max_wait_seconds=600,  # Allow time for model loading with DP+EP
+    ) as remote_server:
+        # Use host and port directly from RemoteOpenAIServer
+        host = f"http://{remote_server.host}"
+        port = remote_server.port
+
+        # Run GSM8K evaluation
+        results = evaluate_gsm8k(
+            num_questions=NUM_QUESTIONS,
+            num_shots=NUM_SHOTS,
+            host=host,
+            port=port,
+        )
+
+        # Validate accuracy is reasonable
+        accuracy = results["accuracy"]
+        assert accuracy >= MIN_ACCURACY, (
+            f"DBO+DP+EP accuracy too low ({all2all_backend}): "
+            f"{accuracy:.3f} < {MIN_ACCURACY:.3f} "
+            f"(correct: {results['num_correct']}/{results['num_questions']})"
+        )
diff --git a/tests/v1/e2e/test_lora_with_spec_decode.py b/tests/v1/e2e/test_lora_with_spec_decode.py
new file mode 100644
index 0000000000000..14532f2795443
--- /dev/null
+++ b/tests/v1/e2e/test_lora_with_spec_decode.py
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This script contains:
+1. test lora with speculative decoding for batch inference
+"""
+
+import random
+
+import numpy as np
+import pytest
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
+
+LORA_TEST_PROMPT_MAP: dict[str, str] = {}
+
+LORA_TEST_PROMPT_MAP["premjatin/qwen-linear-algebra-coder"] = """
+### INSTRUCTION:
+You are an AI assistant that generates Python code to solve linear
+algebra problems.
+
+### PROBLEM:
+Find the eigenvalues and eigenvectors of the following 3x3 matrix:
+[[3, 2, 0],
+ [2, 3, 0],
+ [0, 0, 2]]
+
+### OUTPUT FORMAT (STRICT):
+Numbers should be represented as integers only.
+
+### PYTHON SOLUTION:
+"""
+
+SEED = 42
+
+
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
+@pytest.mark.parametrize(
+    "model_setup",
+    [
+        (
+            "eagle3",
+            "Qwen/Qwen3-1.7B",
+            "AngelSlim/Qwen3-1.7B_eagle3",
+            "premjatin/qwen-linear-algebra-coder",
+            1,
+        )
+    ],
+)
+def test_batch_inference_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    model_setup: tuple[str, str, str, str, int],
+):
+    """
+    Compare the outputs of a LLM with only Lora and a LLM with both SD and Lora.
+    Should be the same and no failure when doing batch inference.
+    model_setup: (method, model_name, spec_model_name, lora_path, tp_size)
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        # Disable randomness
+        m.setenv("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
+        torch.manual_seed(SEED)
+        np.random.seed(SEED)
+        random.seed(SEED)
+        torch.cuda.manual_seed_all(SEED)
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
+
+        method, model_name, spec_model_name, lora_path, tp_size = model_setup
+
+        # without speculative decoding
+        ref_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
+            max_model_len=2048,
+            max_num_seqs=4,
+            enable_lora=True,
+            max_loras=1,
+            max_cpu_loras=1,
+            max_lora_rank=16,
+        )
+
+        prompts = [LORA_TEST_PROMPT_MAP[lora_path]] * 100
+        lora_request = LoRARequest("adapter", 1, lora_path)
+        sampling_params = SamplingParams(
+            temperature=0.0, top_p=1.0, top_k=-1, seed=SEED, max_tokens=128
+        )
+
+        ref_outputs = ref_llm.generate(
+            prompts, sampling_params, lora_request=lora_request
+        )
+        del ref_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
+
+        lora_spec_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
+            speculative_config={
+                "method": method,
+                "model": spec_model_name,
+                "num_speculative_tokens": 3,
+                "max_model_len": 2048,
+            },
+            max_model_len=2048,
+            max_num_seqs=4,
+            enable_lora=True,
+            max_loras=1,
+            max_cpu_loras=1,
+            max_lora_rank=16,
+        )
+
+        lora_spec_outputs = lora_spec_llm.generate(
+            prompts, sampling_params, lora_request=lora_request
+        )
+
+        matches = 0
+        misses = 0
+        for ref_output, spec_output in zip(ref_outputs, lora_spec_outputs):
+            if ref_output.outputs[0].text == spec_output.outputs[0].text:
+                matches += 1
+            else:
+                misses += 1
+                print(f"ref_output: {ref_output.outputs[0].text}")
+                print(f"spec_output: {spec_output.outputs[0].text}")
+
+        # Heuristic: expect at least 90% of the prompts to match exactly
+        # Upon failure, inspect the outputs to check for inaccuracy.
+        print(f"match ratio: {matches}/{len(ref_outputs)}")
+        assert matches > int(0.90 * len(ref_outputs))
+        del lora_spec_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 45b48e5858934..4a6b84ae4817c 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -75,7 +75,23 @@ def model_name():
     return "meta-llama/Llama-3.1-8B-Instruct"
 
 
-def test_ngram_correctness(
+@pytest.mark.parametrize(
+    "speculative_config",
+    [
+        {
+            "method": "ngram",
+            "prompt_lookup_max": 5,
+            "prompt_lookup_min": 3,
+            "num_speculative_tokens": 3,
+        },
+        {
+            "method": "suffix",
+            "suffix_decoding_max_spec_factor": 2.0,
+        },
+    ],
+)
+def test_ngram_and_suffix_correctness(
+    speculative_config: dict,
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
     model_name: str,
@@ -94,12 +110,7 @@ def test_ngram_correctness(
 
     spec_llm = LLM(
         model=model_name,
-        speculative_config={
-            "method": "ngram",
-            "prompt_lookup_max": 5,
-            "prompt_lookup_min": 3,
-            "num_speculative_tokens": 3,
-        },
+        speculative_config=speculative_config,
         max_model_len=1024,
     )
     spec_outputs = spec_llm.chat(test_prompts, sampling_config)
@@ -121,6 +132,65 @@ def test_ngram_correctness(
     cleanup_dist_env_and_memory()
 
 
+def test_suffix_decoding_acceptance(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_name: str,
+):
+    """
+    Check that suffix decoding caching takes effect and improves acceptance
+    lengths and acceptance rates over multiple runs of the same prompts.
+    """
+    test_prompts = get_test_prompts(mm_enabled=False)
+
+    spec_llm = LLM(
+        model=model_name,
+        speculative_config={
+            "method": "suffix",
+            "suffix_decoding_max_spec_factor": 2.0,
+            "suffix_decoding_max_cached_requests": 1000,
+        },
+        max_model_len=1024,
+        disable_log_stats=False,
+    )
+
+    # Run several times and check that the accepted tokens increase.
+    num_draft = []
+    num_accept = []
+    for i in range(10):  # Run multiple times to warm up the cache.
+        spec_llm.chat(test_prompts, sampling_config)
+        # Collect draft and acceptance stats.
+        metrics = spec_llm.get_metrics()
+        for metric in metrics:
+            if metric.name == "vllm:spec_decode_num_draft_tokens":
+                num_draft.append(metric.value)
+            if metric.name == "vllm:spec_decode_num_accepted_tokens":
+                num_accept.append(metric.value)
+
+    # Calculate the acceptance rates for the first and last runs.
+    first_accept_tokens = num_accept[0]
+    first_draft_tokens = num_draft[0]
+    first_accept_rate = first_accept_tokens / first_draft_tokens
+
+    # Take the diff since the stats are cumulative.
+    last_accept_tokens = num_accept[-1] - num_accept[-2]
+    last_draft_tokens = num_draft[-1] - num_draft[-2]
+    last_accept_rate = last_accept_tokens / last_draft_tokens
+
+    # Expect the acceptance length to improve.
+    assert first_accept_tokens < last_accept_tokens
+
+    # Expect the acceptance rate to improve.
+    assert first_accept_rate < last_accept_rate
+
+    # Heuristic: expect at least 85% acceptance rate at the end.
+    assert last_accept_rate > 0.85
+
+    del spec_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+
 @pytest.mark.parametrize(
     "model_path",
     [
@@ -202,9 +272,9 @@ def test_speculators_model_integration(
 
 
 @pytest.mark.parametrize(
-    ["model_setup", "mm_enabled"],
+    ["model_setup", "mm_enabled", "chunked_prefill_enabled"],
     [
-        (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False),
+        (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False),
         pytest.param(
             (
                 "eagle3",
@@ -213,11 +283,12 @@ def test_speculators_model_integration(
                 1,
             ),
             False,
+            False,
             marks=pytest.mark.skip(
                 reason="Skipping due to its head_dim not being a a multiple of 32"
             ),
         ),
-        (
+        pytest.param(
             (
                 "eagle",
                 "meta-llama/Llama-3.1-8B-Instruct",
@@ -225,7 +296,9 @@ def test_speculators_model_integration(
                 1,
             ),
             False,
-        ),
+            True,
+            marks=large_gpu_mark(min_gb=40),
+        ),  # works on 4x H100
         (
             (
                 "eagle3",
@@ -234,6 +307,7 @@ def test_speculators_model_integration(
                 1,
             ),
             False,
+            False,
         ),
         pytest.param(
             (
@@ -243,6 +317,7 @@ def test_speculators_model_integration(
                 4,
             ),
             False,
+            False,
             marks=large_gpu_mark(min_gb=80),
         ),  # works on 4x H100
         pytest.param(
@@ -253,6 +328,7 @@ def test_speculators_model_integration(
                 4,
             ),
             True,
+            True,
             marks=large_gpu_mark(min_gb=80),
         ),  # works on 4x H100
         (
@@ -263,6 +339,7 @@ def test_speculators_model_integration(
                 1,
             ),
             False,
+            False,
         ),
     ],
     ids=[
@@ -281,6 +358,7 @@ def test_eagle_correctness(
     sampling_config: SamplingParams,
     model_setup: tuple[str, str, str, int],
     mm_enabled: bool,
+    chunked_prefill_enabled: bool,
     attn_backend: str,
 ):
     if attn_backend == "TREE_ATTN":
@@ -317,9 +395,13 @@ def test_eagle_correctness(
             m.setenv("VLLM_ROCM_USE_AITER", "1")
 
         method, model_name, spec_model_name, tp_size = model_setup
+        max_model_len = 2048
+        max_num_batched_tokens = max_model_len
+        if chunked_prefill_enabled:
+            max_num_batched_tokens = 128
 
         ref_llm = LLM(
-            model=model_name, max_model_len=2048, tensor_parallel_size=tp_size
+            model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
         )
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
@@ -334,9 +416,11 @@ def test_eagle_correctness(
                 "method": method,
                 "model": spec_model_name,
                 "num_speculative_tokens": 3,
-                "max_model_len": 2048,
+                "max_model_len": max_model_len,
             },
-            max_model_len=2048,
+            max_model_len=max_model_len,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=chunked_prefill_enabled,
         )
         spec_outputs = spec_llm.chat(test_prompts, sampling_config)
         matches = 0
@@ -349,9 +433,9 @@ def test_eagle_correctness(
                 print(f"ref_output: {ref_output.outputs[0].text}")
                 print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 66% of the prompts to match exactly
+        # Heuristic: expect at least 60% of the prompts to match exactly
         # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.66 * len(ref_outputs))
+        assert matches > int(0.6 * len(ref_outputs))
         del spec_llm
         torch.cuda.empty_cache()
         cleanup_dist_env_and_memory()
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index c9605ea1b07c0..25af55baa91f4 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -424,15 +424,12 @@ async def test_customize_loggers(monkeypatch):
 
 
 @pytest.mark.asyncio
-async def test_customize_aggregated_loggers(monkeypatch):
+async def test_customize_aggregated_loggers():
     """Test that we can customize the aggregated loggers.
     If a customized logger is provided at the init, it should
     be added to the default loggers.
     """
-
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(
                 TEXT_ENGINE_ARGS,
diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py
index cf632f1469893..e96759ed66a79 100644
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -53,10 +53,12 @@ def test_defaults_with_usage_context():
     vllm_config: VllmConfig = engine_args.create_engine_config(UsageContext.LLM_CLASS)
 
     from vllm.platforms import current_platform
+    from vllm.utils.mem_constants import GiB_bytes
 
+    device_memory = current_platform.get_device_total_memory()
     device_name = current_platform.get_device_name().lower()
-    if "h100" in device_name or "h200" in device_name:
-        # For H100 and H200, we use larger default values.
+    if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
+        # For GPUs like H100, H200, and MI300x with >= 70GB memory
         default_llm_tokens = 16384
         default_server_tokens = 8192
         default_max_num_seqs = 1024
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 534b60312fd19..84441aa7d28ca 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -66,7 +66,7 @@ def test_engine_core():
     assert len(engine_core.scheduler.waiting) == 1
     assert len(engine_core.scheduler.running) == 0
 
-    _ = engine_core.step()
+    _ = engine_core.step_fn()
     assert len(engine_core.scheduler.waiting) == 0
     assert len(engine_core.scheduler.running) == 1
 
@@ -75,7 +75,7 @@ def test_engine_core():
     assert len(engine_core.scheduler.waiting) == 1
     assert len(engine_core.scheduler.running) == 1
 
-    _ = engine_core.step()
+    _ = engine_core.step_fn()
     assert len(engine_core.scheduler.waiting) == 0
     assert len(engine_core.scheduler.running) == 2
 
@@ -85,12 +85,12 @@ def test_engine_core():
     assert len(engine_core.scheduler.waiting) == 2
     assert len(engine_core.scheduler.running) == 2
 
-    _ = engine_core.step()
+    _ = engine_core.step_fn()
     assert len(engine_core.scheduler.waiting) == 0
     assert len(engine_core.scheduler.running) == 4
 
     # Loop through until they are all done.
-    while (outs := engine_core.step()[0].get(0)) and outs.outputs:
+    while (outs := engine_core.step_fn()[0].get(0)) and outs.outputs:
         pass
 
     assert len(engine_core.scheduler.waiting) == 0
@@ -107,7 +107,7 @@ def test_engine_core():
     assert engine_core.scheduler.has_unfinished_requests()
     assert not engine_core.scheduler.has_finished_requests()
 
-    _ = engine_core.step()
+    _ = engine_core.step_fn()
     assert len(engine_core.scheduler.waiting) == 0
     assert len(engine_core.scheduler.running) == 1
     assert engine_core.scheduler.has_unfinished_requests()
@@ -119,7 +119,7 @@ def test_engine_core():
     assert not engine_core.scheduler.has_unfinished_requests()
     assert engine_core.scheduler.has_finished_requests()
 
-    _ = engine_core.step()
+    _ = engine_core.step_fn()
     assert not engine_core.scheduler.has_unfinished_requests()
     assert not engine_core.scheduler.has_finished_requests()
 
@@ -133,7 +133,7 @@ def test_engine_core():
     assert len(engine_core.scheduler.waiting) == 2
     assert len(engine_core.scheduler.running) == 0
 
-    _ = engine_core.step()
+    _ = engine_core.step_fn()
     assert len(engine_core.scheduler.waiting) == 0
     assert len(engine_core.scheduler.running) == 2
 
@@ -141,7 +141,7 @@ def test_engine_core():
     assert len(engine_core.scheduler.waiting) == 1
     assert len(engine_core.scheduler.running) == 2
 
-    _ = engine_core.step()
+    _ = engine_core.step_fn()
     assert len(engine_core.scheduler.waiting) == 0
     assert len(engine_core.scheduler.running) == 3
 
@@ -150,7 +150,7 @@ def test_engine_core():
     assert len(engine_core.scheduler.waiting) == 0
     assert len(engine_core.scheduler.running) == 2
 
-    _ = engine_core.step()
+    _ = engine_core.step_fn()
     assert len(engine_core.scheduler.waiting) == 0
     assert len(engine_core.scheduler.running) == 2
 
@@ -165,12 +165,12 @@ def test_engine_core():
     req0.request_id = req1.request_id = "test"
     engine_core.add_request(*engine_core.preprocess_add_request(req0))
 
-    while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-        pass
+    while engine_core.scheduler.has_requests():
+        engine_core.step_fn()
 
     engine_core.add_request(*engine_core.preprocess_add_request(req1))
-    while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-        pass
+    while engine_core.scheduler.has_requests():
+        engine_core.step_fn()
 
     assert len(engine_core.scheduler.waiting) == 0
     assert len(engine_core.scheduler.running) == 0
@@ -208,8 +208,8 @@ def test_engine_core_advanced_sampling():
         assert len(engine_core.scheduler.waiting) == 1
         assert len(engine_core.scheduler.running) == 0
         # Loop through until they are all done.
-        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-            pass
+        while engine_core.scheduler.has_requests():
+            engine_core.step_fn()
         assert len(engine_core.scheduler.waiting) == 0
         assert len(engine_core.scheduler.running) == 0
 
@@ -297,6 +297,8 @@ def test_engine_core_concurrent_batches():
         max_num_batched_tokens=10,
         # Reduce startup time.
         enforce_eager=True,
+        # Test concurrent batch behaviour independently of async scheduling.
+        async_scheduling=False,
     )
     vllm_config = engine_args.create_engine_config()
     with set_default_torch_num_threads(1):
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index 28ebe5166d962..d77a119ec60f8 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -15,12 +15,19 @@ from tests.v1.engine.utils import (
 )
 from vllm import PoolingParams
 from vllm.logprobs import PromptLogprobs, SampleLogprobs
+from vllm.lora.request import LoRARequest
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine import (
+    EngineCoreEvent,
+    EngineCoreEventType,
+    EngineCoreOutputs,
+    EngineCoreRequest,
+    FinishReason,
+)
 from vllm.v1.engine.output_processor import OutputProcessor, RequestOutputCollector
-from vllm.v1.metrics.stats import IterationStats
+from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 
 def _ref_convert_id_to_token(
@@ -895,6 +902,170 @@ def test_iteration_stats(dummy_test_vectors):
     assert iteration_stats.num_generation_tokens == num_active
 
 
+@pytest.mark.parametrize("log_stats", [True, False])
+def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
+    """Test LoRA request lifecycle tracking through waiting -> running -> finished."""
+    output_processor = OutputProcessor(
+        dummy_test_vectors.tokenizer, log_stats=log_stats
+    )
+    engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
+    engine_core_timestamp = time.monotonic()
+
+    # Create LoRA requests
+    lora1 = LoRARequest(lora_name="lora-1", lora_int_id=1, lora_path="/path/to/lora1")
+    lora2 = LoRARequest(lora_name="lora-2", lora_int_id=2, lora_path="/path/to/lora2")
+
+    # Create requests with different LoRA adapters:
+    # - request-0: lora-1
+    # - request-1: lora-2
+    # - request-2: None (no LoRA)
+    lora_assignments = [lora1, lora2, None]
+    requests = [
+        EngineCoreRequest(
+            request_id=f"request-{idx}",
+            prompt_token_ids=prompt_tokens,
+            mm_features=None,
+            eos_token_id=None,
+            arrival_time=0,
+            lora_request=lora_assignments[idx],
+            cache_salt=None,
+            data_parallel_rank=None,
+            sampling_params=SamplingParams(),
+            pooling_params=None,
+        )
+        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
+    ]
+
+    # Add all requests to the OutputProcessor
+    for request in requests:
+        output_processor.add_request(request, None)
+
+    # First iteration: process outputs with QUEUED events
+    outputs = EngineCoreOutputs(
+        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
+    )
+    for output in outputs.outputs:
+        output.events = [
+            EngineCoreEvent.new_event(EngineCoreEventType.QUEUED, engine_core_timestamp)
+        ]
+
+    iteration_stats = IterationStats() if log_stats else None
+    output_processor.process_outputs(
+        outputs.outputs, engine_core_timestamp, iteration_stats
+    )
+    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
+    if log_stats:
+        # Verify waiting counts
+        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-1") == 1
+        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-2") == 1
+        assert outputs.scheduler_stats.running_lora_adapters.get("lora-1") == 0
+        assert outputs.scheduler_stats.running_lora_adapters.get("lora-2") == 0
+        # Verify internal state
+        assert len(output_processor.lora_states.requests) == 2
+        assert "lora-1" in output_processor.lora_states.requests
+        assert "lora-2" in output_processor.lora_states.requests
+    else:
+        # When log_stats=False, no tracking should occur
+        assert iteration_stats is None
+        assert len(output_processor.lora_states.requests) == 0
+
+    # Second iteration: process outputs with SCHEDULED events
+    outputs = EngineCoreOutputs(
+        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
+    )
+    for output in outputs.outputs:
+        output.events = [
+            EngineCoreEvent.new_event(
+                EngineCoreEventType.SCHEDULED, engine_core_timestamp
+            )
+        ]
+
+    iteration_stats = IterationStats() if log_stats else None
+    output_processor.process_outputs(
+        outputs.outputs, engine_core_timestamp, iteration_stats
+    )
+    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
+    if log_stats:
+        # Verify running counts
+        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-1") == 0
+        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-2") == 0
+        assert outputs.scheduler_stats.running_lora_adapters.get("lora-1") == 1
+        assert outputs.scheduler_stats.running_lora_adapters.get("lora-2") == 1
+    else:
+        assert iteration_stats is None
+        assert len(output_processor.lora_states.requests) == 0
+
+    # Third iteration: finish request-0 (lora-1)
+    outputs = EngineCoreOutputs(
+        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
+    )
+    # Find and mark request-0 as finished (it uses lora-1)
+    for output in outputs.outputs:
+        if output.request_id == "request-0":
+            output.finish_reason = FinishReason.LENGTH
+            break
+
+    iteration_stats = IterationStats() if log_stats else None
+    output_processor.process_outputs(
+        outputs.outputs, engine_core_timestamp, iteration_stats
+    )
+    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
+    if log_stats:
+        # lora-1 should be removed since no requests remain
+        assert "lora-1" not in output_processor.lora_states.requests
+        # lora-2 should still be running
+        assert outputs.scheduler_stats.running_lora_adapters.get("lora-2") == 1
+        assert len(output_processor.lora_states.requests) == 1
+    else:
+        assert len(output_processor.lora_states.requests) == 0
+
+    # Fourth iteration: finish request-1 (lora-2)
+    outputs = EngineCoreOutputs(
+        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
+    )
+    # Find and mark request-1 as finished (it uses lora-2)
+    for output in outputs.outputs:
+        if output.request_id == "request-1":
+            output.finish_reason = FinishReason.LENGTH
+            break
+
+    iteration_stats = IterationStats() if log_stats else None
+    output_processor.process_outputs(
+        outputs.outputs, engine_core_timestamp, iteration_stats
+    )
+    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
+    if log_stats:
+        # lora-2 should be removed since no requests remain
+        assert "lora-2" not in output_processor.lora_states.requests
+        assert len(outputs.scheduler_stats.running_lora_adapters) == 0
+        assert len(output_processor.lora_states.requests) == 0
+    else:
+        assert len(output_processor.lora_states.requests) == 0
+
+    # Finish the last request (no LoRA)
+    outputs = EngineCoreOutputs(
+        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
+    )
+    # Find and mark request-2 as finished (it has no LoRA)
+    for output in outputs.outputs:
+        if output.request_id == "request-2":
+            output.finish_reason = FinishReason.LENGTH
+            break
+
+    iteration_stats = IterationStats() if log_stats else None
+    output_processor.process_outputs(
+        outputs.outputs, engine_core_timestamp, iteration_stats
+    )
+    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
+    # Verify all requests are finished
+    assert output_processor.get_num_unfinished_requests() == 0
+
+
 @pytest.mark.asyncio
 async def test_request_output_collector():
     NUM_REQS = 3
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index 014e6eca2e02f..4cd26e7b41d3a 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -674,10 +674,10 @@ def test_structured_output_with_reasoning_matrices(
     assert output is not None and isinstance(output, RequestOutput)
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    reasoning_content, content = run_reasoning_extraction(reasoner, [generated_text])
-    print(f"Prompt: {prompt!r}\nReasoning: {reasoning_content!r}\nContent: {content!r}")
+    reasoning, content = run_reasoning_extraction(reasoner, [generated_text])
+    print(f"Prompt: {prompt!r}\nReasoning: {reasoning!r}\nContent: {content!r}")
 
-    assert content is not None and reasoning_content is not None
+    assert content is not None and reasoning is not None
     output_json = json.loads(content)
     jsonschema.validate(instance=output_json, schema=reasoning_schema)
 
@@ -868,11 +868,8 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
 
 @pytest.mark.parametrize("guided_decoding_backend", ["xgrammar"])
 def test_structured_output_with_structural_tag(
-    monkeypatch: pytest.MonkeyPatch,
     guided_decoding_backend: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
     llm = LLM(
         model="Qwen/Qwen2.5-1.5B-Instruct",
         guided_decoding_backend=guided_decoding_backend,
diff --git a/tests/v1/entrypoints/openai/responses/__init__.py b/tests/v1/entrypoints/openai/serving_responses/__init__.py
similarity index 100%
rename from tests/v1/entrypoints/openai/responses/__init__.py
rename to tests/v1/entrypoints/openai/serving_responses/__init__.py
diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/serving_responses/conftest.py
similarity index 84%
rename from tests/v1/entrypoints/openai/responses/conftest.py
rename to tests/v1/entrypoints/openai/serving_responses/conftest.py
index 032ed42f43d1b..8081e5fa1d837 100644
--- a/tests/v1/entrypoints/openai/responses/conftest.py
+++ b/tests/v1/entrypoints/openai/serving_responses/conftest.py
@@ -15,8 +15,13 @@ def default_server_args():
         "--max-model-len",
         "8192",
         "--enforce-eager",  # For faster startup.
+        "--enable-auto-tool-choice",
+        "--structured-outputs-config.backend",
+        "xgrammar",
+        "--tool-call-parser",
+        "hermes",
         "--reasoning-parser",
-        "deepseek_r1",
+        "qwen3",
     ]
 
 
diff --git a/tests/v1/entrypoints/openai/responses/test_basic.py b/tests/v1/entrypoints/openai/serving_responses/test_basic.py
similarity index 100%
rename from tests/v1/entrypoints/openai/responses/test_basic.py
rename to tests/v1/entrypoints/openai/serving_responses/test_basic.py
diff --git a/tests/v1/entrypoints/openai/serving_responses/test_function_call.py b/tests/v1/entrypoints/openai/serving_responses/test_function_call.py
new file mode 100644
index 0000000000000..cf57956a9dea7
--- /dev/null
+++ b/tests/v1/entrypoints/openai/serving_responses/test_function_call.py
@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import openai  # use the official client for correctness check
+import pytest
+
+MODEL_NAME = "Qwen/Qwen3-1.7B"
+tools = [
+    {
+        "type": "function",
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {
+                    "type": "string",
+                    "description": "The city to find the weather for, e.g. 'Vienna'",
+                    "default": "Vienna",
+                },
+                "country": {
+                    "type": "string",
+                    "description": "The country that the city is in, e.g. 'Austria'",
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "The unit to fetch the temperature in",
+                    "enum": ["celsius", "fahrenheit"],
+                },
+                "options": {
+                    "$ref": "#/$defs/WeatherOptions",
+                    "description": "Optional parameters for weather query",
+                },
+            },
+            "required": ["country", "unit"],
+            "$defs": {
+                "WeatherOptions": {
+                    "title": "WeatherOptions",
+                    "type": "object",
+                    "additionalProperties": False,
+                    "properties": {
+                        "unit": {
+                            "type": "string",
+                            "enum": ["celsius", "fahrenheit"],
+                            "default": "celsius",
+                            "description": "Temperature unit",
+                            "title": "Temperature Unit",
+                        },
+                        "include_forecast": {
+                            "type": "boolean",
+                            "default": False,
+                            "description": "Whether to include a 24-hour forecast",
+                            "title": "Include Forecast",
+                        },
+                        "language": {
+                            "type": "string",
+                            "default": "zh-CN",
+                            "description": "Language of the response",
+                            "title": "Language",
+                            "enum": ["zh-CN", "en-US", "ja-JP"],
+                        },
+                    },
+                },
+            },
+        },
+    },
+    {
+        "type": "function",
+        "name": "get_forecast",
+        "description": "Get the weather forecast for a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {
+                    "type": "string",
+                    "description": "The city to get the forecast for, e.g. 'Vienna'",
+                    "default": "Vienna",
+                },
+                "country": {
+                    "type": "string",
+                    "description": "The country that the city is in, e.g. 'Austria'",
+                },
+                "days": {
+                    "type": "integer",
+                    "description": "Number of days to get the forecast for (1-7)",
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "The unit to fetch the temperature in",
+                    "enum": ["celsius", "fahrenheit"],
+                },
+            },
+            "required": ["country", "days", "unit"],
+        },
+    },
+]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("tool_choice", ["auto", "required"])
+async def test_function_tool_use(
+    client: openai.AsyncOpenAI, model_name: str, tool_choice: str
+):
+    prompt = [
+        {
+            "role": "user",
+            "content": "Can you tell me what the current weather is in Berlin and the "
+            "forecast for the next 5 days, in fahrenheit?",
+        },
+    ]
+    response = await client.responses.create(
+        model=model_name,
+        input=prompt,
+        tools=tools,
+        tool_choice=tool_choice,
+    )
+
+    assert len(response.output) >= 1
+    tool_call = None
+    reasoning = None
+    for out in response.output:
+        if out.type == "function_call":
+            tool_call = out
+        if out.type == "reasoning":
+            reasoning = out
+    assert tool_call is not None
+    assert tool_call.type == "function_call"
+    assert json.loads(tool_call.arguments) is not None
+    assert reasoning is not None
+    assert reasoning.type == "reasoning"
+
+
+@pytest.mark.asyncio
+async def test_named_tool_use(client: openai.AsyncOpenAI):
+    def get_weather(latitude: float, longitude: float) -> str:
+        """
+        Mock function to simulate getting weather data.
+        In a real application, this would call an external weather API.
+        """
+        return f"Current temperature at ({latitude}, {longitude}) is 20°C."
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": (
+                "Get current temperature for provided coordinates in celsius."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "latitude": {"type": "number"},
+                    "longitude": {"type": "number"},
+                },
+                "required": ["latitude", "longitude"],
+                "additionalProperties": False,
+            },
+            "strict": True,
+        }
+    ]
+
+    input_messages = [
+        {"role": "user", "content": "What's the weather like in Paris today?"}
+    ]
+
+    response = await client.responses.create(
+        model=MODEL_NAME,
+        input=input_messages,
+        tools=tools,
+        tool_choice={"type": "function", "name": "get_weather"},
+    )
+    assert len(response.output) >= 1
+    for out in response.output:
+        if out.type == "function_call":
+            tool_call = out
+    assert tool_call is not None
+    assert tool_call.type == "function_call"
+    assert tool_call.name == "get_weather"
+    args = json.loads(tool_call.arguments)
+    assert args["latitude"] is not None
+    assert args["longitude"] is not None
+    # call the tool
+    result = get_weather(args["latitude"], args["longitude"])
+    input_messages.append(tool_call)  # append model's function call message
+    input_messages.append(
+        {  # append result message
+            "type": "function_call_output",
+            "call_id": tool_call.call_id,
+            "output": str(result),
+        }
+    )
+    # create a new response with the tool call result
+    response_2 = await client.responses.create(model=MODEL_NAME, input=input_messages)
+    # check the output
+    assert len(response_2.output_text) > 0
diff --git a/tests/v1/entrypoints/openai/responses/test_image.py b/tests/v1/entrypoints/openai/serving_responses/test_image.py
similarity index 100%
rename from tests/v1/entrypoints/openai/responses/test_image.py
rename to tests/v1/entrypoints/openai/serving_responses/test_image.py
diff --git a/tests/v1/entrypoints/openai/responses/test_stateful.py b/tests/v1/entrypoints/openai/serving_responses/test_stateful.py
similarity index 100%
rename from tests/v1/entrypoints/openai/responses/test_stateful.py
rename to tests/v1/entrypoints/openai/serving_responses/test_stateful.py
diff --git a/tests/v1/entrypoints/openai/responses/test_structured_output.py b/tests/v1/entrypoints/openai/serving_responses/test_structured_output.py
similarity index 100%
rename from tests/v1/entrypoints/openai/responses/test_structured_output.py
rename to tests/v1/entrypoints/openai/serving_responses/test_structured_output.py
diff --git a/tests/v1/executor/test_executor.py b/tests/v1/executor/test_executor.py
index 56574124b2727..e9f635378e577 100644
--- a/tests/v1/executor/test_executor.py
+++ b/tests/v1/executor/test_executor.py
@@ -4,10 +4,12 @@
 import asyncio
 import os
 from collections.abc import Callable
+from concurrent.futures import Future
 from typing import Any
 
 import pytest
 
+from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine.async_llm import AsyncLLM
@@ -27,12 +29,19 @@ class CustomMultiprocExecutor(MultiprocExecutor):
         kwargs: dict | None = None,
         non_block: bool = False,
         unique_reply_rank: int | None = None,
-    ) -> list[Any]:
+        kv_output_aggregator: KVOutputAggregator = None,
+    ) -> Any | list[Any] | Future[Any | list[Any]]:
         # Drop marker to show that this was run
         with open(".marker", "w"):
             ...
         return super().collective_rpc(
-            method, timeout, args, kwargs, non_block, unique_reply_rank
+            method,
+            timeout,
+            args,
+            kwargs,
+            non_block,
+            unique_reply_rank,
+            kv_output_aggregator,
         )
 
 
diff --git a/tests/v1/generation/test_batch_invariance.py b/tests/v1/generation/test_batch_invariance.py
index f05fac2478d8a..8fd038bca5d0f 100644
--- a/tests/v1/generation/test_batch_invariance.py
+++ b/tests/v1/generation/test_batch_invariance.py
@@ -456,7 +456,6 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
         model=model,
         max_num_seqs=1,
         tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
-        enforce_eager=True,
         gpu_memory_utilization=0.9,
         max_model_len=2048,
         dtype="bfloat16",
@@ -998,7 +997,6 @@ def LLM_with_max_seqs(
         dtype="bfloat16",
         tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
         enable_prefix_caching=False,
-        enforce_eager=True,
         # Enable for MOE models
         # enable_expert_parallel=True,
     )
diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index a756858e2cc51..a9817313cf022 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -136,7 +136,6 @@ run_tests_for_model() {
     vllm serve $model_name \
     --port $PORT \
     --enforce-eager \
-    --disable-hybrid-kv-cache-manager \
     --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
     --tensor-parallel-size $PREFILLER_TP_SIZE \
     --kv-transfer-config '$KV_CONFIG'"
@@ -179,7 +178,6 @@ run_tests_for_model() {
     --port $PORT \
     --enforce-eager \
     --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
-    --disable-hybrid-kv-cache-manager \
     --kv-transfer-config '$KV_CONFIG'"
   
   # DP-EP attention mode
diff --git a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
index a3eeedb2e5146..c48b452e24cd4 100755
--- a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
@@ -85,7 +85,6 @@ run_tests_for_model() {
   --port $PREFILL_PORT \
   --enforce-eager \
   --gpu-memory-utilization 0.2 \
-  --disable-hybrid-kv-cache-manager \
   --kv-transfer-config '$KV_CONFIG'"
 
   if [ -n "$model_args" ]; then
@@ -104,7 +103,6 @@ run_tests_for_model() {
   --port $DECODE_PORT \
   --enforce-eager \
   --gpu-memory-utilization 0.2 \
-  --disable-hybrid-kv-cache-manager \
   --kv-transfer-config '$KV_CONFIG'"
 
   if [ -n "$model_args" ]; then
diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
new file mode 100644
index 0000000000000..f51001a6ec12a
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
@@ -0,0 +1,275 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for backwards compatibility with external KV connector implementations.
+
+This test ensures that external connectors (loaded via kv_connector_module_path)
+implemented with the old signature continue to work:
+- Old signature: __init__(self, vllm_config, role)
+- New signature: __init__(self, vllm_config, role, kv_cache_config)
+"""
+
+from typing import TYPE_CHECKING
+from unittest.mock import patch
+
+import pytest
+
+from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
+from vllm.distributed.kv_transfer.kv_connector.v1 import (
+    KVConnectorBase_V1,
+    KVConnectorRole,
+)
+from vllm.v1.core.sched.output import SchedulerOutput
+
+from .utils import create_scheduler, create_vllm_config
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+    from vllm.config import VllmConfig
+    from vllm.forward_context import ForwardContext
+    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
+    from vllm.v1.request import Request
+
+
+class OldStyleTestConnector(KVConnectorBase_V1):
+    """
+    Test connector using the old signature with 2 required arguments.
+    This simulates external connectors that haven't been updated yet.
+    """
+
+    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
+        # Old-style call to super().__init__ with only 2 arguments
+        super().__init__(vllm_config=vllm_config, role=role)
+
+    def get_num_new_matched_tokens(
+        self, request: "Request", num_computed_tokens: int
+    ) -> tuple[int | None, bool]:
+        return 0, False
+
+    def update_state_after_alloc(
+        self,
+        request: "Request",
+        blocks: "KVCacheBlocks",
+        num_external_tokens: int,
+    ):
+        pass
+
+    def build_connector_meta(self, scheduler_output: SchedulerOutput):
+        return None
+
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
+        pass
+
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        pass
+
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer,
+        attn_metadata: "AttentionMetadata",
+        **kwargs,
+    ) -> None:
+        pass
+
+    def wait_for_save(self):
+        pass
+
+
+class NewStyleTestConnector(KVConnectorBase_V1):
+    """
+    Test connector using the new signature with 3 required arguments.
+    """
+
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: "KVCacheConfig",
+    ):
+        # New-style call to super().__init__ with all 3 arguments
+        super().__init__(
+            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
+        )
+
+    def get_num_new_matched_tokens(
+        self, request: "Request", num_computed_tokens: int
+    ) -> tuple[int | None, bool]:
+        return 0, False
+
+    def update_state_after_alloc(
+        self,
+        request: "Request",
+        blocks: "KVCacheBlocks",
+        num_external_tokens: int,
+    ):
+        pass
+
+    def build_connector_meta(self, scheduler_output: SchedulerOutput):
+        return None
+
+    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
+        pass
+
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        pass
+
+    def save_kv_layer(
+        self,
+        layer_name: str,
+        kv_layer,
+        attn_metadata: "AttentionMetadata",
+        **kwargs,
+    ) -> None:
+        pass
+
+    def wait_for_save(self):
+        pass
+
+
+@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER])
+def test_external_old_signature_factory_instantiation(role):
+    """
+    Test that external connectors with old signature (2 required args) loaded
+    via kv_connector_module_path are correctly instantiated with backwards
+    compatibility support.
+    """
+    vllm_config = create_vllm_config()
+    vllm_config.kv_transfer_config.kv_connector = "OldStyleTestConnector"
+    vllm_config.kv_transfer_config.kv_connector_module_path = (
+        "tests.v1.kv_connector.unit.test_backwards_compatibility"
+    )
+
+    scheduler = create_scheduler(vllm_config)
+    kv_cache_config = scheduler.kv_cache_config
+
+    connector = KVConnectorFactory.create_connector(vllm_config, role, kv_cache_config)
+
+    assert connector is not None
+    assert isinstance(connector, OldStyleTestConnector)
+    assert connector.role == role
+    assert connector._kv_cache_config is None
+
+
+@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER])
+def test_external_new_signature_factory_instantiation(role):
+    """
+    Test that external connectors with new signature (3 required args) loaded
+    via kv_connector_module_path are correctly instantiated.
+    """
+    vllm_config = create_vllm_config()
+    vllm_config.kv_transfer_config.kv_connector = "NewStyleTestConnector"
+    vllm_config.kv_transfer_config.kv_connector_module_path = (
+        "tests.v1.kv_connector.unit.test_backwards_compatibility"
+    )
+
+    scheduler = create_scheduler(vllm_config)
+    kv_cache_config = scheduler.kv_cache_config
+
+    connector = KVConnectorFactory.create_connector(vllm_config, role, kv_cache_config)
+
+    assert connector is not None
+    assert isinstance(connector, NewStyleTestConnector)
+    assert connector.role == role
+    assert connector._kv_cache_config is not None
+    assert connector._kv_cache_config == kv_cache_config
+
+
+@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER])
+def test_old_signature_super_init(role):
+    """
+    Test that old-style connectors can call super().__init__() without
+    kv_cache_config parameter.
+    """
+    vllm_config = create_vllm_config()
+
+    connector = OldStyleTestConnector(vllm_config, role)
+
+    assert connector is not None
+    assert connector.role == role
+    assert connector._kv_cache_config is None
+
+
+def test_old_signature_super_init_with_kwargs():
+    """
+    Test that old-style connectors can call super().__init__() with keyword
+    arguments in different orders.
+    """
+    vllm_config = create_vllm_config()
+
+    # Test with vllm_config= and role= kwargs
+    connector1 = OldStyleTestConnector(
+        vllm_config=vllm_config, role=KVConnectorRole.SCHEDULER
+    )
+    assert connector1 is not None
+    assert connector1._kv_cache_config is None
+
+    # Test with role= and vllm_config= in reversed order
+    connector2 = OldStyleTestConnector(
+        role=KVConnectorRole.WORKER, vllm_config=vllm_config
+    )
+    assert connector2 is not None
+    assert connector2._kv_cache_config is None
+
+
+def test_internal_connector_uses_new_signature():
+    """
+    Test that internal connectors (registered in factory) always use the new
+    signature and get kv_cache_config.
+    """
+    from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
+        SharedStorageConnector,
+    )
+
+    vllm_config = create_vllm_config()
+    vllm_config.kv_transfer_config.kv_connector = "SharedStorageConnector"
+
+    scheduler = create_scheduler(vllm_config)
+    kv_cache_config = scheduler.kv_cache_config
+
+    connector = KVConnectorFactory.create_connector(
+        vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config
+    )
+
+    assert connector is not None
+    assert isinstance(connector, SharedStorageConnector)
+    assert connector._kv_cache_config is not None
+    assert connector._kv_cache_config == kv_cache_config
+
+
+def test_signature_detection_with_mocking():
+    """
+    Test that the factory correctly applies compat_sig flag returned from
+    _get_connector_class_with_compat.
+    """
+    vllm_config = create_vllm_config()
+    scheduler = create_scheduler(vllm_config)
+    kv_cache_config = scheduler.kv_cache_config
+
+    # Mock _get_connector_class_with_compat to return old-style connector
+    with patch.object(
+        KVConnectorFactory,
+        "_get_connector_class_with_compat",
+        return_value=(OldStyleTestConnector, True),
+    ):
+        old_connector = KVConnectorFactory.create_connector(
+            vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config
+        )
+        assert old_connector is not None
+        assert isinstance(old_connector, OldStyleTestConnector)
+        assert old_connector._kv_cache_config is None
+
+    # Mock _get_connector_class_with_compat to return new-style connector
+    with patch.object(
+        KVConnectorFactory,
+        "_get_connector_class_with_compat",
+        return_value=(NewStyleTestConnector, False),
+    ):
+        new_connector = KVConnectorFactory.create_connector(
+            vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config
+        )
+        assert new_connector is not None
+        assert isinstance(new_connector, NewStyleTestConnector)
+        assert new_connector._kv_cache_config is not None
+        assert new_connector._kv_cache_config == kv_cache_config
diff --git a/tests/v1/kv_connector/unit/test_lmcache_integration.py b/tests/v1/kv_connector/unit/test_lmcache_integration.py
new file mode 100644
index 0000000000000..11507d7cd4e7b
--- /dev/null
+++ b/tests/v1/kv_connector/unit/test_lmcache_integration.py
@@ -0,0 +1,271 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# NOTE: if your PR has broken one of the tests here (sorry),
+# kindly patch the corresponding integration in
+# /vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
+# or reach out to @aposataC for assistance
+
+# Assumption vs. Correctness Tests:
+# these unit tests do *not* test correctness of LMCache-side or vLLM-side logic
+# it is to ensure that assumptions LMCache makes about vLLM's interface are stable
+def assumes(obj, attr, is_callable=False, is_instance_of=None):
+    import inspect
+    from dataclasses import is_dataclass
+
+    assumption_msg = (
+        f"LMCache connector currently assumes that {obj} has a(n) {attr} attribute"
+    )
+    if hasattr(obj, attr):
+        attr_value = getattr(obj, attr)
+    elif is_dataclass(obj) and attr in getattr(obj, "__dataclass_fields__", {}):
+        field = obj.__dataclass_fields__[attr]
+        field_type = field.type
+        origin = getattr(field_type, "__origin__", None)
+        if origin is not None:
+            field_type = origin
+        attr_value = field_type
+    else:
+        raise AssertionError(assumption_msg)
+    if is_callable:
+        assumption_msg += f" and that {obj}.{attr} is a callable"
+        assert callable(attr_value), assumption_msg
+    if is_instance_of:
+        assumption_msg += f" and that {obj}.{attr} is an instance of {is_instance_of}"
+        if isinstance(attr_value, property):
+            fget = attr_value.fget
+            assert fget is not None, f"Property {obj}.{attr} has no fget"
+            sig = inspect.signature(fget)
+            ret_anno = sig.return_annotation
+            assert ret_anno is not inspect._empty, (
+                f"Property {obj}.{attr} has no return annotation"
+            )
+            assert ret_anno == is_instance_of, assumption_msg
+        else:
+            if isinstance(attr_value, type):
+                assert attr_value is is_instance_of, assumption_msg
+            else:
+                assert isinstance(attr_value, is_instance_of), assumption_msg
+
+
+def test_multimodal_interface():
+    # protect against interface changes
+    from vllm.multimodal.inputs import PlaceholderRange
+
+    assumes(PlaceholderRange, "offset")
+    assumes(PlaceholderRange, "length")
+
+    # test a minimal case
+    import torch
+
+    from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import (
+        apply_mm_hashes_to_token_ids,
+    )
+
+    token_ids = torch.arange(10, dtype=torch.long)
+    mm_hashes = ["0000", "1111"]  # hex repr of 0 and 4369
+    mm_positions = [
+        PlaceholderRange(offset=0, length=4),
+        PlaceholderRange(offset=5, length=4),
+    ]
+    apply_mm_hashes_to_token_ids(token_ids, mm_hashes, mm_positions)
+    assert token_ids.tolist() == [0, 0, 0, 0, 4, 4369, 4369, 4369, 4369, 9]
+
+
+def test_config_interface():
+    # protect against interface changes
+    from vllm.config import VllmConfig
+    from vllm.config.cache import CacheConfig
+    from vllm.config.kv_transfer import KVTransferConfig
+    from vllm.config.model import ModelConfig
+    from vllm.config.parallel import ParallelConfig
+
+    assumes(VllmConfig, "model_config")
+    assumes(VllmConfig, "cache_config")
+    assumes(VllmConfig, "parallel_config")
+    assumes(VllmConfig, "kv_transfer_config")
+
+    assumes(KVTransferConfig, "kv_role")
+    assumes(KVTransferConfig, "kv_connector_extra_config")
+
+    assumes(ModelConfig, "use_mla", is_instance_of=bool)
+    assumes(ModelConfig, "dtype")
+    assumes(ModelConfig, "max_model_len")
+    assumes(ModelConfig, "get_vocab_size", is_callable=True)
+    assumes(ModelConfig, "get_num_attention_heads", is_callable=True)
+    assumes(ModelConfig, "get_num_kv_heads", is_callable=True)
+    assumes(ModelConfig, "get_head_size", is_callable=True)
+    assumes(ModelConfig, "get_num_layers", is_callable=True)
+    assumes(ModelConfig, "get_num_kv_heads", is_callable=True)
+    assumes(ModelConfig, "model")
+
+    assumes(ParallelConfig, "world_size")
+    assumes(ParallelConfig, "rank")
+    assumes(ParallelConfig, "tensor_parallel_size")
+    assumes(ParallelConfig, "pipeline_parallel_size")
+    assumes(ParallelConfig, "data_parallel_size_local")
+    assumes(ParallelConfig, "data_parallel_rank_local")
+
+    assumes(CacheConfig, "cache_dtype")
+    assumes(CacheConfig, "block_size")
+    assumes(CacheConfig, "gpu_memory_utilization")
+
+    # mla metadata minimal cases
+    from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import (
+        mla_enabled,
+    )
+
+    model_config = ModelConfig(model="deepseek-ai/DeepSeek-R1")
+    assert mla_enabled(model_config)
+    model_config = ModelConfig(model="Qwen/Qwen3-0.6B")
+    assert not mla_enabled(model_config)
+
+    # kv metadata minimal case
+    from vllm.utils.torch_utils import get_kv_cache_torch_dtype
+
+    model_config = ModelConfig(dtype="bfloat16")
+    parallel_config = ParallelConfig()
+    cache_config = CacheConfig(cache_dtype="bfloat16")
+    kv_dtype = get_kv_cache_torch_dtype(cache_config.cache_dtype, model_config.dtype)
+    use_mla = mla_enabled(model_config)
+    chunk_size = 256
+    num_layer = model_config.get_num_layers(parallel_config)
+    num_kv_head = model_config.get_num_kv_heads(parallel_config)
+    head_size = model_config.get_head_size()
+    kv_shape = (num_layer, 1 if use_mla else 2, chunk_size, num_kv_head, head_size)
+
+    # dummy lmcache metadata creation example
+    _ = (
+        model_config.model,
+        parallel_config.world_size,
+        parallel_config.rank,
+        "vllm",
+        kv_dtype,
+        kv_shape,
+        use_mla,
+    )
+
+
+def test_request_interface():
+    # protect against interface changes
+    from types import NoneType
+
+    from vllm.sampling_params import SamplingParams
+    from vllm.v1.request import Request
+
+    req = Request(
+        request_id="test_request",
+        prompt_token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        pooling_params=None,
+        eos_token_id=100,
+        lora_request=None,
+    )
+    assumes(req, "mm_features", is_instance_of=(list, NoneType))
+    assumes(req, "request_id")
+    assumes(req, "priority")
+    assumes(req, "prompt_token_ids")
+    assumes(req, "sampling_params")
+    assumes(req, "num_tokens")
+    assumes(req, "kv_transfer_params", is_instance_of=(dict, NoneType))
+
+    from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItem
+
+    assumes(MultiModalFeatureSpec, "identifier")
+    assumes(MultiModalFeatureSpec, "mm_position")
+
+    # minimal case:
+    from vllm.multimodal.inputs import PlaceholderRange
+
+    request = Request(
+        request_id="test_request",
+        prompt_token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        pooling_params=None,
+        eos_token_id=100,
+        lora_request=None,
+        mm_features=[
+            MultiModalFeatureSpec(
+                modality="image",
+                identifier="0000",
+                data=MultiModalKwargsItem.dummy("dummy_m"),
+                mm_position=PlaceholderRange(offset=0, length=10),
+            )
+        ],
+    )
+
+    from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import (
+        extract_mm_features,
+    )
+
+    mm_hashes, mm_positions = extract_mm_features(request)
+    assert isinstance(mm_hashes, list)
+    assert len(mm_hashes) == 1
+    assert isinstance(mm_positions, list)
+    assert len(mm_positions) == 1
+    assert mm_positions[0].offset == 0
+    assert mm_positions[0].length == 10
+
+
+def test_new_request_interface():
+    # protect against interface changes
+    from vllm.v1.core.sched.output import NewRequestData
+
+    assumes(NewRequestData, "req_id")
+    assumes(NewRequestData, "block_ids")
+    assumes(NewRequestData, "prompt_token_ids")
+    assumes(NewRequestData, "sampling_params")
+
+
+def test_sampling_params_interface():
+    # protect against interface changes
+    from vllm.sampling_params import SamplingParams
+
+    assumes(SamplingParams, "extra_args")
+
+    # dumb example use case in LMCache
+    kv_transfer_params = {
+        "lmcache.tag.user": "example_user_1",
+        "lmcache.ttl": 60,
+    }
+    sampling_params = SamplingParams(
+        extra_args={"kv_transfer_params": kv_transfer_params}
+    )
+    assert sampling_params.extra_args["kv_transfer_params"] == kv_transfer_params
+
+
+def test_tp_interface():
+    # protect against interface changes
+    import inspect
+
+    from vllm.distributed.parallel_state import get_tp_group
+
+    sig = inspect.signature(get_tp_group)
+    GroupCoordinator = sig.return_annotation
+
+    assumes(GroupCoordinator, "broadcast", is_callable=True)
+    assumes(GroupCoordinator, "broadcast_object", is_callable=True)
+
+
+def test_forward_context_interface():
+    # protect against interface changes
+    from vllm.forward_context import ForwardContext
+
+    assumes(ForwardContext, "no_compile_layers", is_instance_of=dict)
+    assumes(ForwardContext, "virtual_engine")
+    assumes(ForwardContext, "attn_metadata")
+
+
+def test_scheduler_output_interface():
+    # protect against interface changes
+    from vllm.v1.core.sched.output import SchedulerOutput
+
+    assumes(SchedulerOutput, "finished_req_ids")
+    assumes(SchedulerOutput, "scheduled_new_reqs", is_instance_of=list)
+    assumes(SchedulerOutput, "num_scheduled_tokens", is_instance_of=dict)
+    assumes(SchedulerOutput, "scheduled_cached_reqs")
+
+    from vllm.v1.core.sched.output import CachedRequestData
+
+    assumes(CachedRequestData, "req_ids", is_instance_of=list)
+    assumes(CachedRequestData, "new_block_ids", is_instance_of=list)
diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index 6748532afd971..1c1ac915c758e 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -114,7 +114,6 @@ def test_multi_shared_storage_connector_consistency():
         enforce_eager=True,
         gpu_memory_utilization=0.5,
         kv_transfer_config=kv_transfer_config,
-        disable_hybrid_kv_cache_manager=True,
     )
     # Run generation - this should trigger saving KV cache
     _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 1f3fdafc644d8..475cf2285e394 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -1020,7 +1020,6 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
         "gpu_memory_utilization": 0.5,
         "kv_transfer_config": kv_transfer_config,
         "distributed_executor_backend": distributed_executor_backend,
-        "disable_hybrid_kv_cache_manager": True,
     }
 
     timeout = 6
diff --git a/tests/v1/kv_connector/unit/test_output_aggregator.py b/tests/v1/kv_connector/unit/test_output_aggregator.py
index 4dba203ebc7d8..b083ccef9819b 100644
--- a/tests/v1/kv_connector/unit/test_output_aggregator.py
+++ b/tests/v1/kv_connector/unit/test_output_aggregator.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from concurrent.futures import Future
 
 import pytest
 
@@ -86,82 +85,6 @@ def test_aggregate_workers_output():
     assert aggregated.invalid_block_ids == {3, 4, 5}
 
 
-def test_async_aggregate_workers_output():
-    aggregator = KVOutputAggregator(expected_finished_count=2)
-
-    future1: Future[DummyModelRunnerOutput] = Future()
-    future2: Future[DummyModelRunnerOutput] = Future()
-    result_future = aggregator.async_aggregate([future1, future2])
-
-    output1 = DummyModelRunnerOutput()
-    output2 = DummyModelRunnerOutput()
-    future1.set_result(output1)
-    future2.set_result(output2)
-
-    assert result_future.done()
-    aggregated = result_future.result()
-    assert aggregated is output1
-    aggregated = aggregated.kv_connector_output
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving is None
-    assert not aggregated.invalid_block_ids
-
-    future1 = Future()
-    future2 = Future()
-    result_future = aggregator.async_aggregate([future1, future2])
-
-    output1 = DummyModelRunnerOutput(
-        finished_sending={"req1"}, finished_recving={"req2"}
-    )
-    output2 = DummyModelRunnerOutput(invalid_block_ids={1})
-    future1.set_result(output1)
-    future2.set_result(output2)
-
-    assert result_future.done()
-    aggregated = result_future.result()
-    assert aggregated is output1
-    aggregated = aggregated.kv_connector_output
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving is None
-    assert aggregated.invalid_block_ids == {1}
-
-    future1 = Future()
-    future2 = Future()
-    result_future = aggregator.async_aggregate([future1, future2])
-
-    output1 = DummyModelRunnerOutput(invalid_block_ids={2})
-    output2 = DummyModelRunnerOutput(finished_sending={"req1"})
-    future1.set_result(output1)
-    future2.set_result(output2)
-
-    assert result_future.done()
-    aggregated = result_future.result()
-    assert aggregated is output1
-    aggregated = aggregated.kv_connector_output
-    assert aggregated.finished_sending == {"req1"}
-    assert aggregated.finished_recving is None
-    assert aggregated.invalid_block_ids == {2}
-
-    future1 = Future()
-    future2 = Future()
-    result_future = aggregator.async_aggregate([future1, future2])
-
-    output1 = DummyModelRunnerOutput(invalid_block_ids={3, 4})
-    output2 = DummyModelRunnerOutput(
-        finished_recving={"req2"}, invalid_block_ids={4, 5}
-    )
-    future1.set_result(output1)
-    future2.set_result(output2)
-
-    assert result_future.done()
-    aggregated = result_future.result()
-    assert aggregated is output1
-    aggregated = aggregated.kv_connector_output
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving == {"req2"}
-    assert aggregated.invalid_block_ids == {3, 4, 5}
-
-
 def test_aggregate_workers_output_with_expected_finished_count():
     # We create the aggregator expecting to collect from 4 workers
     aggregator = KVOutputAggregator(expected_finished_count=4)
diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
index 6040ed5a6806d..e7013a794a8c6 100644
--- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py
+++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
@@ -132,7 +132,6 @@ def test_shared_storage_connector_hashes(tmp_path):
         enforce_eager=True,
         kv_transfer_config=kv_transfer_config,
         limit_mm_per_prompt={"image": 2},
-        disable_hybrid_kv_cache_manager=True,
     )
 
     # don't put this import at the top level
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 46ea46e53084e..f0031643aa9d4 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -91,9 +91,6 @@ def create_vllm_config(
         max_num_batched_tokens=max_num_batched_tokens,
         max_model_len=max_model_len,
         enable_chunked_prefill=enable_chunked_prefill,
-        # Disable hybrid KV cache manager for testing
-        # Should be removed after we support hybrid KV cache manager-based testing.
-        disable_hybrid_kv_cache_manager=True,
     )
     model_config = ModelConfig(
         model=model,
@@ -254,7 +251,7 @@ def create_model_runner_output(
 
 
 class TestSharedStorageConnector(SharedStorageConnector):
-    def __init__(self, config: VllmConfig, role):
+    def __init__(self, config: VllmConfig, role, kv_cache_config):
         self.name = config.kv_transfer_config.kv_connector_extra_config["name"]
         self._connector = SharedStorageConnector(config, role)
         self.call_record: dict[str, int] = defaultdict(int)
diff --git a/tests/v1/logits_processors/test_custom_online.py b/tests/v1/logits_processors/test_custom_online.py
index 0d902b46bed5a..3e0bb02ed68be 100644
--- a/tests/v1/logits_processors/test_custom_online.py
+++ b/tests/v1/logits_processors/test_custom_online.py
@@ -177,3 +177,32 @@ async def test_custom_logitsprocs(client: openai.AsyncOpenAI, model_name: str):
 
         # Alternate whether to activate dummy logitproc for each request
         use_dummy_logitproc = not use_dummy_logitproc
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_invalid_custom_logitsproc_arg(
+    client: openai.AsyncOpenAI, model_name: str
+):
+    """Test that request with invalid custom logitsproc is rejected"""
+
+    prompt = "Hello, my name is"
+    # Pass invalid (non-int) target_token value to dummy logits processor
+    request_keyword_args: dict[str, Any] = {
+        **api_keyword_args,
+        "extra_body": {
+            "vllm_xargs": {DUMMY_LOGITPROC_ARG: "invalid_target_token_value"}
+        },
+    }
+
+    with pytest.raises(openai.OpenAIError) as exc_info:
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            **request_keyword_args,
+        )
+
+    assert "is not int" in str(exc_info.value)
diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py
index 36cffebb3b457..b8548bc319554 100644
--- a/tests/v1/logits_processors/utils.py
+++ b/tests/v1/logits_processors/utils.py
@@ -52,6 +52,16 @@ prompts = [
 class DummyLogitsProcessor(LogitsProcessor):
     """Fake logit processor to support unit testing and examples"""
 
+    @classmethod
+    def validate_params(cls, params: SamplingParams):
+        target_token: int | None = params.extra_args and params.extra_args.get(
+            "target_token"
+        )
+        if target_token is not None and not isinstance(target_token, int):
+            raise ValueError(
+                f"target_token value {target_token} {type(target_token)} is not int"
+            )
+
     def __init__(
         self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
     ):
@@ -62,11 +72,14 @@ class DummyLogitsProcessor(LogitsProcessor):
         return False
 
     def update_state(self, batch_update: BatchUpdate | None):
+        def extract_extra_arg(params: SamplingParams) -> int | None:
+            self.validate_params(params)
+            return params.extra_args and params.extra_args.get("target_token")
+
         process_dict_updates(
             self.req_info,
             batch_update,
-            lambda params, _, __: params.extra_args
-            and (params.extra_args.get("target_token")),
+            lambda params, _, __: extract_extra_arg(params),
         )
 
     def apply(self, logits: torch.Tensor) -> torch.Tensor:
diff --git a/tests/v1/metrics/test_stats.py b/tests/v1/metrics/test_stats.py
index 67a2d1739b6bb..48067def8357e 100644
--- a/tests/v1/metrics/test_stats.py
+++ b/tests/v1/metrics/test_stats.py
@@ -5,19 +5,4 @@ from vllm.v1.metrics.stats import IterationStats
 
 def test_iteration_stats_repr():
     iteration_stats = IterationStats()
-    iteration_stats.iteration_timestamp = 0
-    expected_repr = (
-        "IterationStats("
-        "iteration_timestamp=0, "
-        "num_generation_tokens=0, "
-        "num_prompt_tokens=0, "
-        "num_preempted_reqs=0, "
-        "finished_requests=[], "
-        "max_num_generation_tokens_iter=[], "
-        "n_params_iter=[], "
-        "time_to_first_tokens_iter=[], "
-        "inter_token_latencies_iter=[], "
-        "waiting_lora_adapters={}, "
-        "running_lora_adapters={})"
-    )
-    assert repr(iteration_stats) == expected_repr
+    assert repr(iteration_stats).startswith("IterationStats(")
diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py
index 6d4a1ecf78c82..354fff22dc2ac 100644
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -530,7 +530,6 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
 def test_spec_decode_logprobs(
     logprobs_mode: LogprobsMode,
     model_setup: tuple[str, str, str],
-    monkeypatch: pytest.MonkeyPatch,
 ):
     """Spec decode logprobs should match those of the base model.
 
@@ -541,64 +540,62 @@ def test_spec_decode_logprobs(
     """
     from vllm import LLM
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        prompt = "Hello world"
-        sampling_params = SamplingParams(
-            temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
-        )
-        method, model_name, spec_model_name = model_setup
-        max_model_len = 256
+    prompt = "Hello world"
+    sampling_params = SamplingParams(
+        temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
+    )
+    method, model_name, spec_model_name = model_setup
+    max_model_len = 256
 
-        # Run base LLM.
-        ref_llm = LLM(
-            model=model_name,
-            max_logprobs=5,
-            max_model_len=max_model_len,
-            seed=42,
-            logprobs_mode=logprobs_mode,
-            gpu_memory_utilization=0.4,
-        )
-        ref_results = ref_llm.generate([prompt], sampling_params)
-        # Collect logprobs outputs from reference LLM.
-        ref_logprobs = []
-        for output in ref_results[0].outputs:
-            for logprobs in output.logprobs:
-                for token_id in logprobs:
-                    ref_logprobs.append(logprobs[token_id])
-        del ref_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
+    # Run base LLM.
+    ref_llm = LLM(
+        model=model_name,
+        max_logprobs=5,
+        max_model_len=max_model_len,
+        seed=42,
+        logprobs_mode=logprobs_mode,
+        gpu_memory_utilization=0.4,
+    )
+    ref_results = ref_llm.generate([prompt], sampling_params)
+    # Collect logprobs outputs from reference LLM.
+    ref_logprobs = []
+    for output in ref_results[0].outputs:
+        for logprobs in output.logprobs:
+            for token_id in logprobs:
+                ref_logprobs.append(logprobs[token_id])
+    del ref_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
 
-        # Run spec decode LLM.
-        spec_llm = LLM(
-            model_name,
-            speculative_config={
-                "method": method,
-                "model": spec_model_name,
-                "num_speculative_tokens": 3,
-                "max_model_len": max_model_len,
-            },
-            max_logprobs=5,
-            max_model_len=max_model_len,
-            seed=42,
-            logprobs_mode=logprobs_mode,
-            gpu_memory_utilization=0.4,
-        )
-        spec_results = spec_llm.generate([prompt], sampling_params)
-        # Collect logprobs outputs from spec decode LLM.
-        spec_logprobs = []
-        for output in spec_results[0].outputs:
-            for logprobs in output.logprobs:
-                for token_id in logprobs:
-                    spec_logprobs.append(logprobs[token_id])
-        del spec_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
+    # Run spec decode LLM.
+    spec_llm = LLM(
+        model_name,
+        speculative_config={
+            "method": method,
+            "model": spec_model_name,
+            "num_speculative_tokens": 3,
+            "max_model_len": max_model_len,
+        },
+        max_logprobs=5,
+        max_model_len=max_model_len,
+        seed=42,
+        logprobs_mode=logprobs_mode,
+        gpu_memory_utilization=0.4,
+    )
+    spec_results = spec_llm.generate([prompt], sampling_params)
+    # Collect logprobs outputs from spec decode LLM.
+    spec_logprobs = []
+    for output in spec_results[0].outputs:
+        for logprobs in output.logprobs:
+            for token_id in logprobs:
+                spec_logprobs.append(logprobs[token_id])
+    del spec_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
 
-        # Per-token logprobs are expected to be the same.
-        assert len(ref_logprobs) == len(spec_logprobs)
-        for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
-            assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3)
-            assert ref_logprob.rank == spec_logprob.rank
-            assert ref_logprob.decoded_token == spec_logprob.decoded_token
+    # Per-token logprobs are expected to be the same.
+    assert len(ref_logprobs) == len(spec_logprobs)
+    for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
+        assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3)
+        assert ref_logprob.rank == spec_logprob.rank
+        assert ref_logprob.decoded_token == spec_logprob.decoded_token
diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py
index bc779f6bd9c4d..fa1d0437f7c71 100644
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@@ -7,6 +7,7 @@ import pytest
 from tests.utils import get_attn_backend_list_based_on_platform
 from vllm import LLM, SamplingParams
 from vllm.platforms import current_platform
+from vllm.sampling_params import StructuredOutputsParams
 
 _PROMPTS = [
     "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
@@ -56,8 +57,34 @@ def test_eagle_max_len(
                 "method": "eagle",
                 "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
                 "num_speculative_tokens": num_speculative_tokens,
+                "max_model_len": 80,
             },
-            max_model_len=100,
+            max_model_len=200,
         )
-        sampling_params = SamplingParams(max_tokens=100, ignore_eos=True)
-        llm.generate(_PROMPTS, sampling_params)
+        sampling_params = SamplingParams(max_tokens=200, ignore_eos=True)
+        outputs = llm.generate(_PROMPTS, sampling_params)
+        for o in outputs:
+            assert o.outputs[0].finish_reason == "length", (
+                "This test is only meaningful if the output "
+                "is truncated due to max length"
+            )
+
+        sampling_params = SamplingParams(
+            max_tokens=200,
+            structured_outputs=StructuredOutputsParams(
+                regex="^" + "a b c d e " * 15 + "$"
+            ),
+        )
+        output = llm.generate(_PROMPTS, sampling_params)
+        for o in output:
+            assert o.prompt_token_ids is not None
+            assert (
+                len(o.prompt_token_ids)
+                < 80
+                < len(o.prompt_token_ids) + len(o.outputs[0].token_ids)
+                < 200
+            ), (
+                "This test is only meaningful if the output "
+                "is longer than the eagle max length"
+            )
+            assert o.outputs[0].text == "a b c d e " * 15
diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py
new file mode 100644
index 0000000000000..771076186a3b4
--- /dev/null
+++ b/tests/v1/structured_output/test_backend_guidance.py
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import AutoTokenizer
+
+from vllm.config import StructuredOutputsConfig, VllmConfig
+from vllm.config.model import ModelConfig
+from vllm.config.speculative import SpeculativeConfig
+from vllm.sampling_params import SamplingParams, StructuredOutputsParams
+from vllm.v1.request import Request
+from vllm.v1.structured_output import StructuredOutputManager
+from vllm.v1.structured_output.backend_guidance import GuidanceBackend
+from vllm.v1.structured_output.backend_types import StructuredOutputOptions
+
+TOKENIZER = "gpt2"
+
+
+def test_backend_guidance_rollback_terminated():
+    # Test that the backend guidance successfully rollbacks from a
+    # terminated state. This can happen with speculative decoding,
+    # where the draft model proposes EOS and it is verified by the
+    # guidance backend. In that case we are in a stopped state, but
+    # it should be reverted in case EOS is not accepted by the target
+    # model.
+    vllm_config = VllmConfig(
+        decoding_config=StructuredOutputsConfig(
+            backend="guidance",
+        )
+    )
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
+
+    backend = GuidanceBackend(
+        vllm_config,
+        tokenizer=tokenizer,
+        vocab_size=50257,
+    )
+
+    grammar = backend.compile_grammar(
+        StructuredOutputOptions.JSON, '{"type": "object"}'
+    )
+
+    prompt = tokenizer.encode('{"a": "b"}')
+    assert len(prompt) > 1
+    dummy_wrong = tokenizer.encode('{"a"}')
+    for token in prompt:
+        assert grammar.accept_tokens("", [token])
+    assert not grammar.is_terminated()
+    assert grammar.accept_tokens("", [tokenizer.eos_token_id])
+    assert grammar.is_terminated()
+    # Giving any other token should also be accepted
+    assert grammar.accept_tokens("", dummy_wrong)
+    # Rollback is done from where state was terminated, so from '}' not EOS
+    grammar.rollback(len(prompt) - 1)
+    assert not grammar.is_terminated()
+    assert grammar.validate_tokens([tokenizer.eos_token_id]) == []
+    assert grammar.validate_tokens(dummy_wrong) != dummy_wrong
+    assert grammar.accept_tokens("", prompt[1:])
+    assert not grammar.is_terminated()
+    assert grammar.accept_tokens("", [tokenizer.eos_token_id])
+    assert grammar.is_terminated()
+    # Rollback of <= 0 should not change the terminated state
+    grammar.rollback(0)
+    assert grammar.is_terminated()
+    grammar.rollback(-1)
+    assert grammar.is_terminated()
+
+
+def test_grammar_bitmask_with_specdec():
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
+    prompt = tokenizer.encode('{"a": "b"}')
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(tokenizer=TOKENIZER),
+        structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
+        speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
+    )
+    structured_output_manager = StructuredOutputManager(vllm_config)
+
+    for i in range(1, 2):
+        sampling_params = SamplingParams(
+            structured_outputs=StructuredOutputsParams(
+                json='{"type": "object"}',
+            ),
+        )
+        sampling_params.structured_outputs._backend = "guidance"
+
+        my_req_id = f"my_req_id_{i}"
+        request = Request(
+            my_req_id,
+            prompt_token_ids=prompt[:i],
+            sampling_params=sampling_params,
+            pooling_params=None,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+
+        structured_output_manager.grammar_init(request)
+
+        def grammar_bitmask(req: Request, tokens: list[int]) -> None:
+            structured_output_manager.grammar_bitmask(
+                requests={req.request_id: req},
+                structured_output_request_ids={req.request_id: 0},
+                scheduled_spec_decode_tokens={req.request_id: tokens},
+            )
+            # At this point, we rolled-back, so should not be terminated
+            assert not req.structured_output_request.grammar.is_terminated()
+
+        # The grammar might not yet be compiled, so we wait for it
+        while not request.structured_output_request._check_grammar_completion():
+            continue
+
+        assert request.structured_output_request.grammar.accept_tokens(
+            request.request_id, prompt[:i]
+        )
+
+        grammar_bitmask(request, prompt[i:] + [tokenizer.eos_token_id])
+        grammar_bitmask(
+            request, prompt[i:] + [tokenizer.eos_token_id] + prompt
+        )  # EOS not the final token
+        grammar_bitmask(request, prompt[i:])  # EOS not present
+        grammar_bitmask(request, prompt[i:] + [tokenizer.eos_token_id])
diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
index 5d3bb924590ad..f989f0744166c 100644
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -1,55 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-
 import pytest
 
-import vllm.envs as envs
-from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
 
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
 
 
-def test_reject_bad_config(monkeypatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-
-
-def test_unsupported_configs(monkeypatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with pytest.raises(NotImplementedError):
-            AsyncEngineArgs(
-                model=MODEL,
-                speculative_config={
-                    "model": MODEL,
-                },
-            ).create_engine_config()
-
-
-def test_enable_by_default_fallback(monkeypatch):
-    with monkeypatch.context() as m:
-        if os.getenv("VLLM_USE_V1", None):
-            m.delenv("VLLM_USE_V1")
-
-        # Should default to V1 for supported config.
-        _ = AsyncEngineArgs(
+def test_unsupported_configs():
+    with pytest.raises(NotImplementedError):
+        AsyncEngineArgs(
             model=MODEL,
-            enforce_eager=True,
+            speculative_config={
+                "model": MODEL,
+            },
         ).create_engine_config()
-        assert envs.VLLM_USE_V1
-        m.delenv("VLLM_USE_V1")
-
-
-def test_v1_llm_by_default(monkeypatch):
-    with monkeypatch.context() as m:
-        if os.getenv("VLLM_USE_V1", None):
-            m.delenv("VLLM_USE_V1")
-
-        # Should default to V1 for supported config.
-        llm = LLM(MODEL, enforce_eager=True, enable_lora=True)
-        print(llm.generate("Hello my name is"))
-        assert hasattr(llm.llm_engine, "engine_core")
-        m.delenv("VLLM_USE_V1")
diff --git a/tests/v1/test_outputs.py b/tests/v1/test_outputs.py
new file mode 100644
index 0000000000000..af9df844249ef
--- /dev/null
+++ b/tests/v1/test_outputs.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from unittest import TestCase
+
+from vllm.v1.outputs import LogprobsLists
+
+
+class TestLogprobsLists(TestCase):
+    def setUp(self):
+        self.logprobsLists = LogprobsLists(
+            logprob_token_ids=[
+                [1, 2],  # Request 0 token 0
+                [3, 4],  # Request 0 token 1
+                [5, 6],  # Request 1 token 0
+                [7, 8],  # Request 1 token 1
+                [9, 10],  # Request 1 token 2
+                [11, 12],  # Request 2 token 0
+                [13, 14],  # Request 2 token 1
+                [15, 16],  # Request 2 token 2
+                [17, 18],  # Request 2 token 3
+            ],
+            logprobs=[
+                [0.1, 0.2],
+                [0.3, 0.4],
+                [0.5, 0.6],
+                [0.7, 0.8],
+                [0.9, 1.0],
+                [1.1, 1.2],
+                [1.3, 1.4],
+                [1.5, 1.6],
+                [1.7, 1.8],
+            ],
+            sampled_token_ranks=[1, 3, 5, 7, 9, 11, 13, 15, 17],
+            cu_num_generated_tokens=[0, 2, 5, 9],
+        )
+
+    def test_slice_without_cu_num_generated_tokens(self):
+        """Test slicing without cu_num_generated_tokens"""
+        logprobsLists = LogprobsLists(
+            logprob_token_ids=[[1], [2], [3]],
+            logprobs=[[0.1], [0.2], [0.3]],
+            sampled_token_ranks=[1, 2, 3],
+            cu_num_generated_tokens=None,
+        )
+
+        sliced = logprobsLists.slice(1, 3)
+        assert sliced.logprob_token_ids == [[2], [3]]
+        assert sliced.logprobs == [[0.2], [0.3]]
+        assert sliced.sampled_token_ranks == [2, 3]
+        assert sliced.cu_num_generated_tokens is None
+
+    def test_slice_from_start(self):
+        """Test slicing from the start position"""
+        sliced = self.logprobsLists.slice(0, 2)
+        assert len(sliced.logprob_token_ids) == 5
+        assert sliced.logprob_token_ids == [
+            [1, 2],
+            [3, 4],
+            [5, 6],
+            [7, 8],
+            [9, 10],
+        ]
+        assert sliced.cu_num_generated_tokens == [0, 2, 5]
+
+    def test_slice_from_middle(self):
+        """Test slicing from the middle position"""
+        sliced = self.logprobsLists.slice(1, 3)
+        assert len(sliced.logprob_token_ids) == 7
+        assert sliced.logprob_token_ids == [
+            [5, 6],
+            [7, 8],
+            [9, 10],
+            [11, 12],
+            [13, 14],
+            [15, 16],
+            [17, 18],
+        ]
+        assert sliced.cu_num_generated_tokens == [0, 3, 7]
+
+    def test_slice_single_request(self):
+        """Test slicing a single request"""
+        sliced = self.logprobsLists.slice(1, 2)
+        assert len(sliced.logprob_token_ids) == 3
+        assert sliced.logprob_token_ids == [[5, 6], [7, 8], [9, 10]]
+        assert sliced.cu_num_generated_tokens == [0, 3]
+
+    def test_slice_last_request(self):
+        """Test slicing the last request"""
+        sliced = self.logprobsLists.slice(2, 3)
+        assert len(sliced.logprob_token_ids) == 4
+        assert sliced.logprob_token_ids == [[11, 12], [13, 14], [15, 16], [17, 18]]
+        assert sliced.cu_num_generated_tokens == [0, 4]
+
+    def test_slice_all_requests(self):
+        """Test slicing all requests (full slice)"""
+        sliced = self.logprobsLists.slice(0, 3)
+        assert len(sliced.logprob_token_ids) == 9  # All tokens
+        assert sliced.logprob_token_ids == self.logprobsLists.logprob_token_ids
+        assert (
+            sliced.cu_num_generated_tokens == self.logprobsLists.cu_num_generated_tokens
+        )
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index db0215511d322..bc624658308bf 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -951,6 +951,7 @@ def test_hybrid_block_table_initialization():
     max_num_reqs = 10
     max_num_blocks_per_req = 20
     max_num_batched_tokens = 512
+    dcp_kv_cache_interleave_size = 8
 
     block_table = BlockTable(
         block_size=block_size,
@@ -960,6 +961,7 @@ def test_hybrid_block_table_initialization():
         pin_memory=False,
         device=torch.device(DEVICE),
         kernel_block_size=kernel_block_sizes[0],
+        dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
     )
 
     # Verify hybrid block configuration
diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py
index 742aab6b0de75..4a20b6b7bb8fb 100644
--- a/tools/install_nixl_from_source_ubuntu.py
+++ b/tools/install_nixl_from_source_ubuntu.py
@@ -3,9 +3,11 @@
 # install_prerequisites.py
 import argparse
 import glob
+import json
 import os
 import subprocess
 import sys
+import urllib.request
 
 # --- Configuration ---
 WHEELS_CACHE_HOME = os.environ.get("WHEELS_CACHE_HOME", "/tmp/wheels_cache")
@@ -18,6 +20,20 @@ NIXL_REPO_URL = "https://github.com/ai-dynamo/nixl.git"
 
 
 # --- Helper Functions ---
+def get_latest_nixl_version():
+    """Helper function to get latest release version of NIXL"""
+    try:
+        nixl_release_url = "https://api.github.com/repos/ai-dynamo/nixl/releases/latest"
+        with urllib.request.urlopen(nixl_release_url) as response:
+            data = json.load(response)
+            return data.get("tag_name", "0.7.0")
+    except Exception:
+        return "0.7.0"
+
+
+NIXL_VERSION = os.environ.get("NIXL_VERSION", get_latest_nixl_version())
+
+
 def run_command(command, cwd=".", env=None):
     """Helper function to run a shell command and check for errors."""
     print(f"--> Running command: {' '.join(command)} in '{cwd}'", flush=True)
@@ -37,7 +53,7 @@ def is_pip_package_installed(package_name):
 def find_nixl_wheel_in_cache(cache_dir):
     """Finds a nixl wheel file in the specified cache directory."""
     # The repaired wheel will have a 'manylinux' tag, but this glob still works.
-    search_pattern = os.path.join(cache_dir, "nixl*.whl")
+    search_pattern = os.path.join(cache_dir, f"nixl*{NIXL_VERSION}*.whl")
     wheels = glob.glob(search_pattern)
     if wheels:
         # Sort to get the most recent/highest version if multiple exist
@@ -146,6 +162,10 @@ def build_and_install_prerequisites(args):
     print("\n[2/3] Building NIXL wheel from source...", flush=True)
     if not os.path.exists(NIXL_DIR):
         run_command(["git", "clone", NIXL_REPO_URL, NIXL_DIR])
+    else:
+        run_command(["git", "fetch", "--tags"], cwd=NIXL_DIR)
+    run_command(["git", "checkout", NIXL_VERSION], cwd=NIXL_DIR)
+    print(f"--> Checked out NIXL version: {NIXL_VERSION}", flush=True)
 
     build_env = os.environ.copy()
     build_env["PKG_CONFIG_PATH"] = os.path.join(ucx_install_path, "lib", "pkgconfig")
@@ -203,7 +223,14 @@ def build_and_install_prerequisites(args):
             {os.path.basename(newly_built_wheel)}. Now installing...",
         flush=True,
     )
-    install_command = [sys.executable, "-m", "pip", "install", newly_built_wheel]
+    install_command = [
+        sys.executable,
+        "-m",
+        "pip",
+        "install",
+        "--no-deps",  # w/o "no-deps", it will install cuda-torch
+        newly_built_wheel,
+    ]
     if args.force_reinstall:
         install_command.insert(-1, "--force-reinstall")
 
diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
new file mode 100644
index 0000000000000..8d35aa65738b2
--- /dev/null
+++ b/vllm/_aiter_ops.py
@@ -0,0 +1,942 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import functools
+from collections.abc import Callable
+
+import torch
+
+import vllm.envs as envs
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
+
+
+def is_aiter_found() -> bool:
+    from importlib.util import find_spec
+
+    return find_spec("aiter") is not None
+
+
+# `find_spec` is not torch.compile compatible.
+# In cases where aiter availability might have
+# been checked in forward passes that are torch compiled.
+# we keep this global outside to not cause torch compile breaks.
+IS_AITER_FOUND = is_aiter_found()
+
+
+def if_aiter_supported(func: Callable) -> Callable:
+    """Decorator that only executes the function if
+    ROCm AITER package is supported on gfx9 archs.
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        # checks the platform, device arch and aiter library existance.
+
+        from vllm.platforms.rocm import on_gfx9
+
+        if current_platform.is_rocm() and on_gfx9() and IS_AITER_FOUND:
+            return func(*args, **kwargs)
+        else:
+            # Return None or do nothing if not supported
+            return None
+
+    return wrapper
+
+
+def _rocm_aiter_fused_moe_impl(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weight: torch.Tensor,
+    topk_ids: torch.Tensor,
+    expert_mask: torch.Tensor | None = None,
+    activation_method: int = 0,
+    quant_method: int = 0,
+    doweight_stage1: bool = False,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+) -> torch.Tensor:
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
+
+    activation = ActivationType(activation_method)
+    quant_type = QuantType(quant_method)
+
+    return fused_moe(
+        hidden_states,
+        w1,
+        w2,
+        topk_weight,
+        topk_ids,
+        expert_mask,
+        activation,
+        quant_type,
+        doweight_stage1,
+        w1_scale,
+        w2_scale,
+        a1_scale,
+        a2_scale,
+    )
+
+
+def _rocm_aiter_fused_moe_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weight: torch.Tensor,
+    topk_ids: torch.Tensor,
+    expert_mask: torch.Tensor | None = None,
+    activation_method: int = 0,
+    quant_method: int = 0,
+    doweight_stage1: bool = False,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+def _rocm_aiter_asm_moe_tkw1_impl(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    fc1_scale: torch.Tensor | None = None,
+    fc2_scale: torch.Tensor | None = None,
+    fc1_smooth_scale: torch.Tensor | None = None,
+    fc2_smooth_scale: torch.Tensor | None = None,
+    a16: bool = False,
+    per_tensor_quant_scale: torch.Tensor | None = None,
+    expert_mask: torch.Tensor | None = None,
+    activation_method: int = 0,
+) -> torch.Tensor:
+    from aiter import ActivationType
+    from aiter.fused_moe_bf16_asm import asm_moe_tkw1
+
+    activation = ActivationType(activation_method)
+
+    return asm_moe_tkw1(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        fc1_scale=fc1_scale,
+        fc2_scale=fc2_scale,
+        fc1_smooth_scale=fc1_smooth_scale,
+        fc2_smooth_scale=fc2_smooth_scale,
+        a16=a16,
+        per_tensor_quant_scale=per_tensor_quant_scale,
+        expert_mask=expert_mask,
+        activation=activation,
+    )
+
+
+def _rocm_aiter_asm_moe_tkw1_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    fc1_scale: torch.Tensor | None = None,
+    fc2_scale: torch.Tensor | None = None,
+    fc1_smooth_scale: torch.Tensor | None = None,
+    fc2_smooth_scale: torch.Tensor | None = None,
+    a16: bool = False,
+    per_tensor_quant_scale: torch.Tensor | None = None,
+    expert_mask: torch.Tensor | None = None,
+    activation_method: int = 0,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+def _rocm_aiter_topk_softmax_impl(
+    topk_weights: torch.Tensor,
+    topk_indices: torch.Tensor,
+    token_expert_indices: torch.Tensor,
+    gating_output: torch.Tensor,
+    renormalize: bool,
+) -> None:
+    from aiter import topk_softmax
+
+    topk_softmax(
+        topk_weights, topk_indices, token_expert_indices, gating_output, renormalize
+    )
+
+
+def _rocm_aiter_topk_softmax_fake(
+    topk_weights: torch.Tensor,
+    topk_indices: torch.Tensor,
+    token_expert_indices: torch.Tensor,
+    gating_output: torch.Tensor,
+    renormalize: bool,
+) -> None:
+    pass
+
+
+def _rocm_aiter_biased_grouped_topk_impl(
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    need_renorm: bool,
+    routed_scaling_factor: float = 1.0,  # mul to topk_weights
+) -> None:
+    from aiter import biased_grouped_topk
+
+    biased_grouped_topk(
+        gating_output,
+        correction_bias,
+        topk_weights,
+        topk_ids,
+        num_expert_group,
+        topk_group,
+        need_renorm,
+        routed_scaling_factor,
+    )
+
+
+def _rocm_aiter_biased_grouped_topk_fake(
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    need_renorm: bool,
+    routed_scaling_factor: float = 1.0,  # mul to topk_weights
+) -> None:
+    pass
+
+
+def _rocm_aiter_grouped_topk_impl(
+    gating_output: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    need_renorm: bool,
+    scoring_func: str = "softmax",
+    routed_scaling_factor: float = 1.0,  # mul to topk_weights
+) -> None:
+    is_softmax = scoring_func == "softmax"
+    from aiter import grouped_topk
+
+    grouped_topk(
+        gating_output,
+        topk_weights,
+        topk_ids,
+        num_expert_group,
+        topk_group,
+        need_renorm,
+        is_softmax,
+        routed_scaling_factor,
+    )
+
+
+def _rocm_aiter_grouped_topk_fake(
+    gating_output: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    need_renorm: bool,
+    scoring_func: str = "softmax",
+    routed_scaling_factor: float = 1.0,  # mul to topk_weights
+) -> None:
+    pass
+
+
+def _rocm_aiter_mla_decode_fwd_impl(
+    q: torch.Tensor,
+    kv_buffer: torch.Tensor,
+    o: torch.Tensor,
+    qo_indptr: torch.Tensor,
+    max_seqlen_qo: int,
+    kv_indptr: torch.Tensor | None = None,
+    kv_indices: torch.Tensor | None = None,
+    kv_last_page_lens: torch.Tensor | None = None,
+    sm_scale: float = 1.0,
+    logit_cap: float = 0.0,
+) -> None:
+    from aiter.mla import mla_decode_fwd
+
+    mla_decode_fwd(
+        q,
+        kv_buffer.view(-1, 1, 1, q.shape[-1]),
+        o,
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        max_seqlen_qo,
+        sm_scale=sm_scale,
+        logit_cap=logit_cap,
+    )
+
+
+def _rocm_aiter_mla_decode_fwd_fake(
+    q: torch.Tensor,
+    kv_buffer: torch.Tensor,
+    o: torch.Tensor,
+    qo_indptr: torch.Tensor,
+    max_seqlen_qo: int,
+    kv_indptr: torch.Tensor | None = None,
+    kv_indices: torch.Tensor | None = None,
+    kv_last_page_lens: torch.Tensor | None = None,
+    sm_scale: float = 1.0,
+    logit_cap: float = 0.0,
+) -> None:
+    pass
+
+
+def _rocm_aiter_gemm_w8a8_impl(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    from aiter import gemm_a8w8_CK
+
+    # gemm_a8w8_CK(a, b, scale_a, scale_b, bias) expects
+    # a to be [M, K]
+    # b to be [N, K]
+    # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format
+    return gemm_a8w8_CK(A, B, As, Bs, bias, output_dtype)
+
+
+def _rocm_aiter_gemm_w8a8_fake(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    m = A.shape[0]
+    n = B.shape[0]
+    Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
+    return Y
+
+
+def _rocm_aiter_gemm_w8a8_blockscale_impl(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    from aiter import gemm_a8w8_blockscale
+
+    return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
+
+
+def _rocm_aiter_gemm_w8a8_blockscale_fake(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    m = A.shape[0]
+    n = B.shape[0]
+    Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
+    return Y
+
+
+def _rocm_aiter_rms_norm_impl(
+    x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
+) -> torch.Tensor:
+    from aiter import rms_norm
+
+    if x.dim() > 2:
+        x_original_shape = x.shape
+        x = x.reshape(-1, x_original_shape[-1])
+        x = rms_norm(x, weight, variance_epsilon)
+        return x.reshape(x_original_shape)
+
+    return rms_norm(x, weight, variance_epsilon)
+
+
+def _rocm_aiter_rms_norm_fake(
+    x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
+) -> torch.Tensor:
+    return torch.empty_like(x)
+
+
+def _rocm_aiter_rmsnorm2d_fwd_with_add_impl(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    from aiter import rmsnorm2d_fwd_with_add
+
+    residual_out = torch.empty_like(residual)
+    output = torch.empty_like(x)
+    rmsnorm2d_fwd_with_add(
+        output,  # output
+        x,  # input
+        residual,  # residual input
+        residual_out,  # residual output
+        weight,
+        variance_epsilon,
+    )
+    return output, residual_out
+
+
+def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    variance_epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return torch.empty_like(x), torch.empty_like(residual)
+
+
+# Global flag to ensure ops are registered only once
+_OPS_REGISTERED = False
+
+
+class rocm_aiter_ops:
+    _AITER_ENABLED = envs.VLLM_ROCM_USE_AITER
+    _LINEAR_ENABLED = envs.VLLM_ROCM_USE_AITER_LINEAR
+    _RMSNORM_ENABLED = envs.VLLM_ROCM_USE_AITER_RMSNORM
+    _FMOE_ENABLED = envs.VLLM_ROCM_USE_AITER_MOE
+    _MLA_ENABLED = envs.VLLM_ROCM_USE_AITER_MLA
+    _PG_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
+    _MHA_ENABLED = envs.VLLM_ROCM_USE_AITER_MHA
+    _TRITON_UNIFIED_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION
+    _FP8BMM_ENABLED = envs.VLLM_ROCM_USE_AITER_FP8BMM
+    _FP4_GEMM_DYNAMIC_QUANT_ASM = envs.VLLM_ROCM_USE_AITER_FP4_ASM_GEMM
+    _TRITON_ROTARY_EMBED = envs.VLLM_ROCM_USE_AITER_TRITON_ROPE
+    _MOE_SHARED_EXPERTS_ENABLED = envs.VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS
+
+    @classmethod
+    @if_aiter_supported
+    def is_enabled(cls) -> bool:
+        """Verifies device specs and availability of aiter main env variable."""
+        return cls._AITER_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_linear_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._LINEAR_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_linear_fp8_enaled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls.is_linear_enabled() and current_platform.is_fp8_fnuz()
+
+    @classmethod
+    @if_aiter_supported
+    def is_rmsnorm_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._RMSNORM_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_fused_moe_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._FMOE_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_fusion_moe_shared_experts_enabled(cls) -> bool:
+        return cls.is_fused_moe_enabled() and cls._MOE_SHARED_EXPERTS_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_mla_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._MLA_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_mha_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._MHA_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_pa_attn_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._PG_ATTN_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_triton_unified_attn_enabled(cls) -> bool:
+        """ "Verifies device specs and availability of env variable."""
+        return cls._AITER_ENABLED and cls._TRITON_UNIFIED_ATTN_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_fp8bmm_enabled(cls) -> bool:
+        return cls._AITER_ENABLED and cls._FP8BMM_ENABLED
+
+    @classmethod
+    @if_aiter_supported
+    def is_asm_fp4_gemm_dynamic_quant_enabled(cls) -> bool:
+        return cls._AITER_ENABLED and cls._FP4_GEMM_DYNAMIC_QUANT_ASM
+
+    @classmethod
+    @if_aiter_supported
+    def is_triton_rotary_embed_enabled(cls) -> bool:
+        return cls._AITER_ENABLED and cls._TRITON_ROTARY_EMBED
+
+    @staticmethod
+    @if_aiter_supported
+    def register_ops_once() -> None:
+        global _OPS_REGISTERED
+        if not _OPS_REGISTERED:
+            tags = (
+                tuple()
+                if is_torch_equal_or_newer("2.7.0")
+                else (torch.Tag.needs_fixed_stride_order,)
+            )
+
+            # register all the custom ops here
+            direct_register_custom_op(
+                op_name="rocm_aiter_asm_moe_tkw1",
+                op_func=_rocm_aiter_asm_moe_tkw1_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_asm_moe_tkw1_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_fused_moe",
+                op_func=_rocm_aiter_fused_moe_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_fused_moe_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_topk_softmax",
+                op_func=_rocm_aiter_topk_softmax_impl,
+                mutates_args=["topk_weights", "topk_indices", "token_expert_indices"],
+                fake_impl=_rocm_aiter_topk_softmax_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_biased_grouped_topk",
+                op_func=_rocm_aiter_biased_grouped_topk_impl,
+                mutates_args=["topk_weights", "topk_ids"],
+                fake_impl=_rocm_aiter_biased_grouped_topk_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_grouped_topk",
+                op_func=_rocm_aiter_grouped_topk_impl,
+                mutates_args=["topk_weights", "topk_ids"],
+                fake_impl=_rocm_aiter_grouped_topk_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_mla_decode_fwd",
+                op_func=_rocm_aiter_mla_decode_fwd_impl,
+                mutates_args=["o"],
+                fake_impl=_rocm_aiter_mla_decode_fwd_fake,
+                tags=tags,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_gemm_w8a8",
+                op_func=_rocm_aiter_gemm_w8a8_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_gemm_w8a8_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_gemm_w8a8_blockscale",
+                op_func=_rocm_aiter_gemm_w8a8_blockscale_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_gemm_w8a8_blockscale_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_rms_norm",
+                op_func=_rocm_aiter_rms_norm_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_rms_norm_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            direct_register_custom_op(
+                op_name="rocm_aiter_rmsnorm2d_fwd_with_add",
+                op_func=_rocm_aiter_rmsnorm2d_fwd_with_add_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_rmsnorm2d_fwd_with_add_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
+            _OPS_REGISTERED = True
+
+    @staticmethod
+    def rms_norm2d_with_add(
+        x: torch.Tensor,
+        residual: torch.Tensor,
+        weight: torch.Tensor,
+        variance_epsilon: float,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add(
+            x, residual, weight, variance_epsilon
+        )
+
+    @staticmethod
+    def rms_norm(
+        x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon)
+
+    @staticmethod
+    def gemm_w8a8(
+        A: torch.Tensor,
+        B: torch.Tensor,
+        As: torch.Tensor,
+        Bs: torch.Tensor,
+        bias: torch.Tensor | None = None,
+        output_dtype: torch.dtype = torch.float16,
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_gemm_w8a8(A, B, As, Bs, bias, output_dtype)
+
+    @staticmethod
+    def gemm_w8a8_blockscale(
+        A: torch.Tensor,
+        B: torch.Tensor,
+        As: torch.Tensor,
+        Bs: torch.Tensor,
+        block_size: list[int],
+        output_dtype: torch.dtype = torch.float16,
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale(
+            A, B, As, Bs, output_dtype
+        )
+
+    @staticmethod
+    def fused_moe(
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weight: torch.Tensor,
+        topk_ids: torch.Tensor,
+        expert_mask: torch.Tensor | None = None,
+        activation_method: int = 0,
+        quant_method: int = 0,
+        doweight_stage1: bool = False,
+        w1_scale: torch.Tensor | None = None,
+        w2_scale: torch.Tensor | None = None,
+        a1_scale: torch.Tensor | None = None,
+        a2_scale: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_fused_moe(
+            hidden_states,
+            w1,
+            w2,
+            topk_weight,
+            topk_ids,
+            expert_mask,
+            activation_method,
+            quant_method,
+            doweight_stage1,
+            w1_scale,
+            w2_scale,
+            a1_scale,
+            a2_scale,
+        )
+
+    @staticmethod
+    def asm_moe_tkw1(
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        fc1_scale: torch.Tensor | None = None,
+        fc2_scale: torch.Tensor | None = None,
+        fc1_smooth_scale: torch.Tensor | None = None,
+        fc2_smooth_scale: torch.Tensor | None = None,
+        a16: bool = False,
+        per_tensor_quant_scale: torch.Tensor | None = None,
+        expert_mask: torch.Tensor | None = None,
+        activation_method: int = 0,
+    ) -> torch.Tensor:
+        return torch.ops.vllm.rocm_aiter_asm_moe_tkw1(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            fc1_scale,
+            fc2_scale,
+            fc1_smooth_scale,
+            fc2_smooth_scale,
+            a16,
+            per_tensor_quant_scale,
+            expert_mask,
+            activation_method,
+        )
+
+    @staticmethod
+    def topk_softmax(
+        topk_weights: torch.Tensor,
+        topk_indices: torch.Tensor,
+        token_expert_indices: torch.Tensor,
+        gating_output: torch.Tensor,
+        renormalize: bool,
+    ) -> tuple[torch.Tensor, ...]:
+        torch.ops.vllm.rocm_aiter_topk_softmax(
+            topk_weights, topk_indices, token_expert_indices, gating_output, renormalize
+        )
+        return topk_weights, topk_indices
+
+    @staticmethod
+    def biased_grouped_topk(
+        gating_output: torch.Tensor,
+        correction_bias: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_expert_group: int,
+        topk_group: int,
+        need_renorm: bool,
+        routed_scaling_factor: float = 1.0,
+    ) -> None:
+        torch.ops.vllm.rocm_aiter_biased_grouped_topk(
+            gating_output,
+            correction_bias,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            need_renorm,
+            routed_scaling_factor,
+        )
+
+    @staticmethod
+    def grouped_topk(
+        gating_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_expert_group: int,
+        topk_group: int,
+        need_renorm: bool,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+    ) -> None:
+        torch.ops.vllm.rocm_aiter_grouped_topk(
+            gating_output,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            need_renorm,
+            scoring_func,
+            routed_scaling_factor,
+        )
+
+    @staticmethod
+    def mla_decode_fwd(
+        q: torch.Tensor,
+        kv_buffer: torch.Tensor,
+        o: torch.Tensor,
+        sm_scale: float,
+        qo_indptr: torch.Tensor,
+        max_seqlen_qo: int,
+        kv_indptr: torch.Tensor | None = None,
+        kv_indices: torch.Tensor | None = None,
+        kv_last_page_lens: torch.Tensor | None = None,
+        logit_cap: float = 0.0,
+    ):
+        torch.ops.vllm.rocm_aiter_mla_decode_fwd(
+            q,
+            kv_buffer.view(-1, 1, 1, q.shape[-1]),
+            o,
+            qo_indptr,
+            max_seqlen_qo,
+            kv_indptr,
+            kv_indices,
+            kv_last_page_lens,
+            sm_scale=sm_scale,
+            logit_cap=logit_cap,
+        )
+
+    @staticmethod
+    def triton_fp4_gemm_dynamic_qaunt(
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        out_dtype: torch.dtype | None = torch.bfloat16,
+        x_scales: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4
+        from aiter.ops.triton.quant import dynamic_mxfp4_quant
+
+        if x_scales is None:
+            x_q, x_s = dynamic_mxfp4_quant(x)
+        else:
+            x_q = x
+            x_s = x_scales
+
+        y = torch.empty(
+            x_q.shape[0], weight.shape[0], device=x_q.device, dtype=out_dtype
+        )
+
+        gemm_afp4wfp4(x_q, weight, x_s, weight_scale.T, out_dtype, y)
+        return y
+
+    @staticmethod
+    def triton_rotary_embed(
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        cos_sin_cache: torch.Tensor,
+        head_size: int,
+        rotary_dim: int,
+        is_neox_style: bool,
+    ):
+        from aiter.ops.triton.rope import rope_cached_thd_positions_2c_fwd_inplace
+
+        num_tokens = positions.numel()
+        cos, sin = cos_sin_cache.chunk(2, dim=-1)
+        query_shape = query.shape
+        key_shape = key.shape
+        rotate_style = 0 if is_neox_style else 1
+
+        query = query.view(num_tokens, -1, head_size)
+        key = key.view(num_tokens, -1, head_size)
+        query_ = query[..., :rotary_dim]
+        key_ = key[..., :rotary_dim]
+        positions = positions.view(*query.shape[:1])
+        rope_cached_thd_positions_2c_fwd_inplace(
+            positions,
+            sin,
+            cos,
+            query_,
+            key_,
+            rotate_style,
+            reuse_freqs_front_part=True,
+            is_nope_first=False,
+        )
+        query = query.view(query_shape)
+        key = key.view(key_shape)
+
+    @staticmethod
+    def triton_fp8_bmm(
+        X: torch.Tensor,
+        WQ: torch.Tensor,
+        w_scale: torch.Tensor,
+        group_size: int = 128,
+        bias: torch.Tensor | None = None,
+        dtype: torch.dtype | None = torch.bfloat16,
+        splitK: int | None = None,
+        YQ: torch.Tensor | None = None,
+        transpose_bm: bool | None = False,
+        config: dict | None = None,
+    ) -> torch.Tensor:
+        # ruff: noqa: E501 # isort: skip
+        from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import (
+            batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm,
+        )
+
+        return aiter_triton_fp8_bmm(
+            X,
+            WQ,
+            w_scale,
+            group_size=group_size,
+            bias=bias,
+            dtype=dtype,
+            splitK=splitK,
+            YQ=YQ,
+            transpose_bm=transpose_bm,
+            config=config,
+        )
+
+    @staticmethod
+    def triton_gemm_a8w8_blockscale(
+        A: torch.Tensor,
+        B: torch.Tensor,
+        As: torch.Tensor,
+        Bs: torch.Tensor,
+        block_size: list[int],
+        output_dtype: torch.dtype = torch.float16,
+    ) -> torch.Tensor:
+        from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
+
+        return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
+
+    @staticmethod
+    def per_1x128_fp8_quant(
+        input_2d: torch.Tensor,
+    ) -> tuple[torch.Tensor, ...]:
+        """Only applies quantization method for fp8 data type only."""
+        from aiter import QuantType, dtypes, get_hip_quant
+
+        aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128)
+        return aiter_per1x128_quant(input_2d.contiguous(), quant_dtype=dtypes.fp8)
+
+    @staticmethod
+    def is_triton_gemm_w8a8_tuned(n: int, k: int) -> bool:
+        return (n, k) in [
+            (1024, 8192),
+            (2112, 7168),
+            (3072, 1536),
+            (32768, 8192),
+            (4096, 7168),
+            (4608, 7168),
+            (512, 7168),
+            (7168, 2048),
+            (7168, 256),
+            (8192, 1024),
+            (8192, 32768),
+        ]
+
+    @staticmethod
+    def shuffle_weight(
+        self, tensor: torch.Tensor, layout: tuple[int, int] = (16, 16)
+    ) -> torch.Tensor:
+        from aiter.ops.shuffle import shuffle_weight
+
+        return shuffle_weight(tensor, layout=layout)
+
+    @staticmethod
+    def shuffle_weights(
+        *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16)
+    ) -> tuple[torch.Tensor, ...]:
+        """
+        Applies shuffle_weight function from AITER to each
+        input tensor and returns them.
+
+        Rearranges (shuffles) the input tensor/s
+        into a specified block layout for optimized computation.
+
+        Args:
+            *tensors: Variable number of torch.Tensor objects.
+            layout: A pair of integers specifying the block sizes used to divide
+                the tensors during shuffling. Default is (16, 16).
+
+        Returns:
+        A Tuple of shuffled tensors.
+        """
+        from aiter.ops.shuffle import shuffle_weight
+
+        return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors)
+
+
+if IS_AITER_FOUND:
+    rocm_aiter_ops.register_ops_once()
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 9110b0573fc92..36aab503dee70 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -836,7 +836,11 @@ def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool:
 
 
 def cutlass_group_gemm_supported(cuda_device_capability: int) -> bool:
-    return torch.ops._C.cutlass_group_gemm_supported(cuda_device_capability)
+    try:
+        return torch.ops._C.cutlass_group_gemm_supported(cuda_device_capability)
+    except AttributeError:
+        # Return False on non-CUDA platforms where it is not available
+        return False
 
 
 def cutlass_sparse_compress(a: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
@@ -1381,7 +1385,7 @@ def scaled_fp4_quant(
     rounded_m = round_up(m, 128)
     scale_n = n // block_size
     rounded_n = round_up(scale_n, 4)
-    output_scale = torch.zeros(
+    output_scale = torch.empty(
         (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
     )
 
@@ -1719,6 +1723,10 @@ def selective_scan_fwd(
     has_initial_state: torch.Tensor | None,
     ssm_states: torch.Tensor,
     pad_slot_id: int,
+    block_size: int = 1024,
+    block_idx_first_scheduled_token: torch.Tensor | None = None,
+    block_idx_last_scheduled_token: torch.Tensor | None = None,
+    initial_state_idx: torch.Tensor | None = None,
 ):
     torch.ops._C.selective_scan_fwd(
         u,
@@ -1735,6 +1743,10 @@ def selective_scan_fwd(
         has_initial_state,
         ssm_states,
         pad_slot_id,
+        block_size,
+        block_idx_first_scheduled_token,
+        block_idx_last_scheduled_token,
+        initial_state_idx,
     )
 
 
@@ -1815,6 +1827,8 @@ def moe_lora_align_block_size(
     sorted_token_ids: torch.Tensor,
     experts_ids: torch.Tensor,
     num_tokens_post_pad: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    lora_ids: torch.Tensor,
 ) -> None:
     torch.ops._moe_C.moe_lora_align_block_size(
         topk_ids,
@@ -1827,6 +1841,8 @@ def moe_lora_align_block_size(
         sorted_token_ids,
         experts_ids,
         num_tokens_post_pad,
+        adapter_enabled,
+        lora_ids,
     )
 
 
@@ -1882,25 +1898,40 @@ def topk_softmax(
 
 def grouped_topk(
     scores: torch.Tensor,
-    scores_with_bias: torch.Tensor,
     num_expert_group: int,
     topk_group: int,
     topk: int,
     renormalize: bool,
     routed_scaling_factor: float,
+    bias: torch.Tensor,
+    scoring_func: int = 0,
 ):
+    """
+    Perform grouped top-k routing for mixture of experts.
+
+    Args:
+        scores: Raw inputs (logits if scoring_func=1, scores if scoring_func=0)
+        num_expert_group: Number of expert groups
+        topk_group: Number of groups to select
+        topk: Number of experts to select per token
+        renormalize: Whether to renormalize the output weights
+        routed_scaling_factor: Scaling factor for routing weights
+        bias: Bias tensor (e_score_correction_bias). Always fused in kernel.
+        scoring_func: 0=none (no activation), 1=sigmoid
+    """
     if not current_platform.is_cuda():
         raise NotImplementedError(
             "The fused grouped_topk kernel is only available on CUDA platforms"
         )
     return torch.ops._moe_C.grouped_topk(
         scores,
-        scores_with_bias,
         num_expert_group,
         topk_group,
         topk,
         renormalize,
         routed_scaling_factor,
+        bias,
+        scoring_func,
     )
 
 
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index e9c6a278a9411..b54eaf4e2872d 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -51,19 +51,10 @@ class AttentionBackend(ABC):
     def get_impl_cls() -> type["AttentionImpl"]:
         raise NotImplementedError
 
-    @staticmethod
-    @abstractmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        raise NotImplementedError
-
     @classmethod
     def get_supported_kernel_block_size(cls) -> list[int | MultipleOf]:
         return cls.get_impl_cls().get_supported_kernel_block_size()
 
-    @classmethod
-    def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata":
-        return cls.get_metadata_cls()(*args, **kwargs)
-
     @staticmethod
     @abstractmethod
     def get_builder_cls():  # -> Type["AttentionMetadataBuilder"]:
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 17e025155a431..96272981692c0 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -745,6 +745,9 @@ class MLAAttention(nn.Module, AttentionLayerBase):
         k_pe: torch.Tensor,
         output_shape: torch.Size | None = None,
     ) -> torch.Tensor:
+        if self.calculate_kv_scales:
+            torch.ops.vllm.maybe_calc_kv_scales(q, kv_c_normed, k_pe, self.layer_name)
+
         if self.use_direct_call:
             forward_context: ForwardContext = get_forward_context()
             attn_metadata = forward_context.attn_metadata
@@ -752,12 +755,6 @@ class MLAAttention(nn.Module, AttentionLayerBase):
                 attn_metadata = attn_metadata[self.layer_name]
             self_kv_cache = self.kv_cache[forward_context.virtual_engine]
 
-            # Mirror Attention.forward scale calculation path
-            if self.calculate_kv_scales and getattr(
-                attn_metadata, "enable_kv_scales_calculation", False
-            ):
-                self.calc_kv_scales(q, kv_c_normed, k_pe)
-
             if self.attn_backend.accept_output_buffer:
                 output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
                 self.impl.forward(
@@ -786,14 +783,6 @@ class MLAAttention(nn.Module, AttentionLayerBase):
                 )
                 return output
             else:
-                # We can still access forward context to check calculation flag
-                if self.calculate_kv_scales:
-                    forward_context = get_forward_context()
-                    attn_metadata = forward_context.attn_metadata
-                    if isinstance(attn_metadata, dict):
-                        attn_metadata = attn_metadata[self.layer_name]
-                    if getattr(attn_metadata, "enable_kv_scales_calculation", False):
-                        self.calc_kv_scales(q, kv_c_normed, k_pe)
                 return torch.ops.vllm.unified_mla_attention(
                     q,
                     kv_c_normed,
@@ -881,17 +870,13 @@ def maybe_calc_kv_scales(
     layer_name: str,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.attn_metadata
+    self = forward_context.no_compile_layers[layer_name]
 
-    if isinstance(attn_metadata, dict):
-        attn_metadata = attn_metadata[layer_name]
-
-    if attn_metadata is None or not getattr(
-        attn_metadata, "enable_kv_scales_calculation", False
-    ):
+    # Only calculate if the layer's calculate_kv_scales flag is True
+    # This flag gets set to False after the first forward pass
+    if not self.calculate_kv_scales:
         return
 
-    self = forward_context.no_compile_layers[layer_name]
     self.calc_kv_scales(query, key, value)
 
 
diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py
index 18422404d08f9..5532ce80d7f15 100644
--- a/vllm/attention/layers/chunked_local_attention.py
+++ b/vllm/attention/layers/chunked_local_attention.py
@@ -5,7 +5,6 @@ from typing import ClassVar
 
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig
@@ -78,17 +77,12 @@ class ChunkedLocalAttention(Attention):
             kv_cache_dtype = "auto"
             block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(
-                head_size, dtype, kv_cache_dtype, block_size
-            )
-
-            attn_backend = create_chunked_local_attention_backend(
-                underlying_attn_backend, attention_chunk_size, block_size
-            )
-        else:
-            # in v0 the local attention is handled inside the backends
-            attn_backend = None
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
+        attn_backend = create_chunked_local_attention_backend(
+            underlying_attn_backend, attention_chunk_size, block_size
+        )
 
         super().__init__(
             num_heads=num_heads,
diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py
index 4b89c28f0ca6a..5b44c7e3e7ec8 100644
--- a/vllm/attention/layers/cross_attention.py
+++ b/vllm/attention/layers/cross_attention.py
@@ -6,7 +6,6 @@ from copy import copy
 import numpy as np
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionMetadata,
@@ -150,15 +149,10 @@ class CrossAttention(Attention):
             kv_cache_dtype = "auto"
             block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(
-                head_size, dtype, kv_cache_dtype, block_size
-            )
-
-            attn_backend = create_cross_attention_backend(underlying_attn_backend)
-        else:
-            # in v0 cross attention is handled inside the backends
-            attn_backend = None
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
+        attn_backend = create_cross_attention_backend(underlying_attn_backend)
 
         if attn_type is not None:
             assert attn_type == AttentionType.ENCODER_DECODER, (
diff --git a/vllm/attention/layers/encoder_only_attention.py b/vllm/attention/layers/encoder_only_attention.py
index 8d2a046757feb..4929bbf5efc73 100644
--- a/vllm/attention/layers/encoder_only_attention.py
+++ b/vllm/attention/layers/encoder_only_attention.py
@@ -5,7 +5,6 @@ from copy import copy
 
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionMetadata,
@@ -74,17 +73,11 @@ class EncoderOnlyAttention(Attention):
             kv_cache_dtype = "auto"
             block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(
-                head_size, dtype, kv_cache_dtype, block_size
-            )
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
 
-            attn_backend = create_encoder_only_attention_backend(
-                underlying_attn_backend
-            )
-        else:
-            # in v0 encoder only attention is handled inside the backends
-            attn_backend = None
+        attn_backend = create_encoder_only_attention_backend(underlying_attn_backend)
 
         if attn_type is not None:
             assert attn_type == AttentionType.ENCODER_ONLY, (
diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py
index b6b7ecd2552a7..2cbb5c91cc3b3 100644
--- a/vllm/attention/ops/common.py
+++ b/vllm/attention/ops/common.py
@@ -53,6 +53,7 @@ def _correct_attn_cp_out_kernel(
     lse = tl.load(lses_ptr + lse_offsets)
     lse = tl.where((lse != lse) | (lse == float("inf")), -float("inf"), lse)
     lse_max = tl.max(lse, axis=0)
+    lse_max = tl.where(lse_max == -float("inf"), 0, lse_max)
     lse -= lse_max
     lse_exp = tl.exp(lse)
     lse_acc = tl.sum(lse_exp, axis=0)
@@ -194,7 +195,6 @@ def cp_lse_ag_out_rs(
     cp_attn_lse = cp_attn_lse.contiguous()
     lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
     out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
-    assert out.is_contiguous()
     out = cp_group.reduce_scatter(out, dim=1)
 
     if return_lse:
diff --git a/vllm/attention/ops/rocm_aiter_mla.py b/vllm/attention/ops/rocm_aiter_mla.py
deleted file mode 100644
index 6308f63cc4e70..0000000000000
--- a/vllm/attention/ops/rocm_aiter_mla.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-
-import torch
-
-from vllm.platforms import current_platform
-from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
-
-
-def get_aiter_mla_metadata(
-    max_batch_size: int, block_size: int, max_block_per_batch: int, device: torch.device
-) -> tuple[torch.Tensor, ...]:
-    paged_kv_indices = torch.zeros(
-        max_batch_size * max_block_per_batch, dtype=torch.int32, device=device
-    )
-    paged_kv_indptr = torch.zeros(max_batch_size + 1, dtype=torch.int32, device=device)
-    paged_kv_last_page_lens = torch.full(
-        (max_batch_size,), block_size, dtype=torch.int32
-    )
-    qo_indptr = torch.zeros(max_batch_size + 1, dtype=torch.int, device=device)
-    return paged_kv_indices, paged_kv_indptr, paged_kv_last_page_lens, qo_indptr
-
-
-def aiter_mla_decode_fwd(
-    q: torch.Tensor,
-    kv_buffer: torch.Tensor,
-    o: torch.Tensor,
-    sm_scale: float,
-    qo_indptr: torch.Tensor,
-    max_seqlen_qo: int,
-    kv_indptr: torch.Tensor | None = None,
-    kv_indices: torch.Tensor | None = None,
-    kv_last_page_lens: torch.Tensor | None = None,
-    logit_cap: float = 0.0,
-):
-    torch.ops.vllm.rocm_aiter_mla_decode_fwd(
-        q,
-        kv_buffer.view(-1, 1, 1, q.shape[-1]),
-        o,
-        qo_indptr,
-        max_seqlen_qo,
-        kv_indptr,
-        kv_indices,
-        kv_last_page_lens,
-        sm_scale=sm_scale,
-        logit_cap=logit_cap,
-    )
-
-
-def mla_decode_fwd_impl(
-    q: torch.Tensor,
-    kv_buffer: torch.Tensor,
-    o: torch.Tensor,
-    qo_indptr: torch.Tensor,
-    max_seqlen_qo: int,
-    kv_indptr: torch.Tensor | None = None,
-    kv_indices: torch.Tensor | None = None,
-    kv_last_page_lens: torch.Tensor | None = None,
-    sm_scale: float = 1.0,
-    logit_cap: float = 0.0,
-) -> None:
-    from aiter.mla import mla_decode_fwd
-
-    mla_decode_fwd(
-        q,
-        kv_buffer.view(-1, 1, 1, q.shape[-1]),
-        o,
-        qo_indptr,
-        kv_indptr,
-        kv_indices,
-        kv_last_page_lens,
-        max_seqlen_qo,
-        sm_scale=sm_scale,
-        logit_cap=logit_cap,
-    )
-
-
-def mla_decode_fwd_fake(
-    q: torch.Tensor,
-    kv_buffer: torch.Tensor,
-    o: torch.Tensor,
-    qo_indptr: torch.Tensor,
-    max_seqlen_qo: int,
-    kv_indptr: torch.Tensor | None = None,
-    kv_indices: torch.Tensor | None = None,
-    kv_last_page_lens: torch.Tensor | None = None,
-    sm_scale: float = 1.0,
-    logit_cap: float = 0.0,
-) -> None:
-    pass
-
-
-if current_platform.is_rocm():
-    if is_torch_equal_or_newer("2.7.0"):
-        tags = ()
-    else:
-        tags = ((torch.Tag.needs_fixed_stride_order,),)
-    direct_register_custom_op(
-        op_name="rocm_aiter_mla_decode_fwd",
-        op_func=mla_decode_fwd_impl,
-        mutates_args=["o"],
-        fake_impl=mla_decode_fwd_fake,
-        tags=tags,
-    )
diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py
index 6cefe74416685..06a9f7cd82266 100644
--- a/vllm/attention/ops/vit_attn_wrappers.py
+++ b/vllm/attention/ops/vit_attn_wrappers.py
@@ -14,6 +14,7 @@ To use these ops, you must have a recent version of PyTorch installed (>= 2.4.0)
 
 import einops
 import torch
+import torch.nn.functional as F
 
 from vllm.utils.torch_utils import direct_register_custom_op
 
@@ -123,3 +124,55 @@ def vit_flash_attn_wrapper(
     return torch.ops.vllm.flash_attn_maxseqlen_wrapper(
         q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, use_upstream_fa
     )
+
+
+# TODO: Once we have a torch 2.10, we can use tensor slices
+# so we won't need to wrap this in custom ops
+def torch_sdpa_wrapper(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+) -> torch.Tensor:
+    outputs = []
+    for i in range(1, len(cu_seqlens)):
+        start_idx = cu_seqlens[i - 1]
+        end_idx = cu_seqlens[i]
+        q_i = q[:, start_idx:end_idx]
+        k_i = k[:, start_idx:end_idx]
+        v_i = v[:, start_idx:end_idx]
+        q_i, k_i, v_i = (
+            einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
+        )
+        output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
+        output_i = einops.rearrange(output_i, "b h s d -> b s h d ")
+        outputs.append(output_i)
+    context_layer = torch.cat(outputs, dim=1)
+    context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
+    return context_layer
+
+
+def torch_sdpa_wrapper_fake(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+) -> torch.Tensor:
+    b, s, h, d = q.shape
+    return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
+
+
+direct_register_custom_op(
+    op_name="torch_sdpa_wrapper",
+    op_func=torch_sdpa_wrapper,
+    fake_impl=torch_sdpa_wrapper_fake,
+)
+
+
+def vit_torch_sdpa_wrapper(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+) -> torch.Tensor:
+    return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, cu_seqlens)
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 9890d8d80cba2..9c26a8d40edaf 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -134,16 +134,11 @@ def get_attn_backend(
     use_sparse: bool = False,
 ) -> type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
-    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-    # value to be returned from the cache if the value changes between calls.
-    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-    # private function.
     return _cached_get_attn_backend(
         head_size=head_size,
         dtype=dtype,
         kv_cache_dtype=kv_cache_dtype,
         block_size=block_size,
-        use_v1=envs.VLLM_USE_V1,
         use_mla=use_mla,
         has_sink=has_sink,
         use_sparse=use_sparse,
@@ -156,7 +151,6 @@ def _cached_get_attn_backend(
     dtype: torch.dtype,
     kv_cache_dtype: str | None,
     block_size: int,
-    use_v1: bool = False,
     use_mla: bool = False,
     has_sink: bool = False,
     use_sparse: bool = False,
@@ -199,7 +193,7 @@ def _cached_get_attn_backend(
         dtype,
         kv_cache_dtype,
         block_size,
-        use_v1,
+        True,
         use_mla,
         has_sink,
         use_sparse,
diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py
index b92b822c1d19f..adb9b08a65735 100644
--- a/vllm/attention/utils/fa_utils.py
+++ b/vllm/attention/utils/fa_utils.py
@@ -80,6 +80,13 @@ def flash_attn_supports_fp8() -> bool:
     )
 
 
+def flash_attn_supports_sinks() -> bool:
+    if current_platform.is_xpu():
+        return True
+    else:
+        return get_flash_attn_version() == 3
+
+
 def flash_attn_supports_mla():
     from vllm.platforms import current_platform
 
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index b1aa8530eb026..5411ecbb27b27 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -30,7 +30,6 @@ from io import BytesIO
 from tempfile import NamedTemporaryFile
 from typing import Any, cast
 
-import cv2
 import numpy as np
 from PIL import Image
 from transformers import PreTrainedTokenizerBase
@@ -850,6 +849,8 @@ class RandomMultiModalDataset(RandomDataset):
         Creates a video with random pixel values, encodes it to MP4 format,
         and returns the content as bytes.
         """
+        import cv2
+
         random_pixels = self._rng.integers(
             0,
             256,
@@ -1710,6 +1711,11 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
         ):
             dataset_class = MTBenchDataset
             args.hf_split = "train"
+        elif (
+            args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
+        ):
+            dataset_class = MultiModalConversationDataset
         elif (
             args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS
             or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS
@@ -2271,11 +2277,70 @@ class HuggingFaceDataset(BenchmarkDataset):
 
 
 class ConversationDataset(HuggingFaceDataset):
-    """Dataset for conversation data with multimodal support."""
+    """Dataset for text-only conversation data."""
+
+    SUPPORTED_DATASET_PATHS = {
+        "Aeala/ShareGPT_Vicuna_unfiltered",
+    }
+    IS_MULTIMODAL = False
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: int | None = None,
+        enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list:
+        # Filter examples with at least 2 conversations
+        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
+        sampled_requests = []
+        ind = 0
+        dynamic_output = output_len is None
+
+        for item in filtered_data:
+            if len(sampled_requests) >= num_requests:
+                break
+            conv = item["conversations"]
+            prompt, completion = conv[0]["value"], conv[1]["value"]
+
+            prompt_ids = tokenizer(prompt).input_ids
+            completion_ids = tokenizer(completion).input_ids
+            prompt_len = len(prompt_ids)
+            completion_len = len(completion_ids)
+            output_len = completion_len if dynamic_output else output_len
+            assert isinstance(output_len, int) and output_len > 0
+            if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
+                continue
+            mm_content = process_image(item["image"]) if "image" in item else None
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len and output len
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_content,
+                    request_id=request_id_prefix + str(ind),
+                )
+            )
+            ind += 1
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
+        return sampled_requests
+
+
+class MultiModalConversationDataset(HuggingFaceDataset):
+    """Dataset for multimodal conversation data."""
 
     SUPPORTED_DATASET_PATHS = {
         "lmms-lab/LLaVA-OneVision-Data",
-        "Aeala/ShareGPT_Vicuna_unfiltered",
     }
     IS_MULTIMODAL = True
 
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 4b15d8e62913c..0e9b0fbe2c028 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -19,7 +19,6 @@ On the client side, run:
 import argparse
 import asyncio
 import contextlib
-import gc
 import importlib.util
 import json
 import os
@@ -49,6 +48,7 @@ from vllm.benchmarks.lib.endpoint_request_func import (
 from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
 from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils.gc_utils import freeze_gc_heap
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
@@ -189,9 +189,16 @@ async def get_request(
             total_requests,
             request_rate,
         )
+        assert current_request_rate > 0.0, (
+            f"Obtained non-positive request rate {current_request_rate}."
+        )
         request_rates.append(current_request_rate)
         if current_request_rate == float("inf"):
             delay_ts.append(0)
+        elif burstiness == float("inf"):
+            # when burstiness tends to infinity, the delay time becomes constant
+            # and tends to the inverse of the request rate
+            delay_ts.append(1.0 / current_request_rate)
         else:
             theta = 1.0 / (current_request_rate * burstiness)
 
@@ -1352,6 +1359,14 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             "'--dataset-path' if required."
         )
 
+    # when using random datasets, default to ignoring EOS
+    # so generation runs to the requested length
+    if (
+        args.dataset_name in ("random", "random-mm")
+        and args.backend in OPENAI_COMPATIBLE_BACKENDS
+    ):
+        args.ignore_eos = True
+
     # Load the dataset.
     input_requests = get_samples(args, tokenizer)
     goodput_config_dict = check_goodput_args(args)
@@ -1399,8 +1414,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     percentile_metrics: str = args.percentile_metrics or default_percentile_metrics
 
     # Avoid GC processing "static" data - reduce pause times.
-    gc.collect()
-    gc.freeze()
+    freeze_gc_heap()
 
     benchmark_result = await benchmark(
         task_type=task_type,
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index 78c0f8bbbda7a..23b5faa1b2c32 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -21,6 +21,7 @@ from vllm.benchmarks.datasets import (
     BurstGPTDataset,
     ConversationDataset,
     InstructCoderDataset,
+    MultiModalConversationDataset,
     PrefixRepetitionRandomDataset,
     RandomDataset,
     SampleRequest,
@@ -367,6 +368,11 @@ def get_requests(args, tokenizer):
         elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = InstructCoderDataset
             common_kwargs["dataset_split"] = "train"
+        elif args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS:
+            dataset_cls = MultiModalConversationDataset
+            common_kwargs["dataset_subset"] = args.hf_subset
+            common_kwargs["dataset_split"] = args.hf_split
+            sample_kwargs["enable_multimodal_chat"] = True
         elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = ConversationDataset
             common_kwargs["dataset_subset"] = args.hf_subset
@@ -456,6 +462,7 @@ def validate_args(args):
     elif args.dataset_name == "hf":
         if args.dataset_path in (
             VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
+            | MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
             | ConversationDataset.SUPPORTED_DATASET_PATHS
         ):
             assert args.backend == "vllm-chat", (
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 53fd5e74dc0a8..be69075f94f09 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -19,7 +19,7 @@ import vllm.envs as envs
 from vllm.compilation.inductor_pass import pass_context
 from vllm.compilation.partition_rules import (
     inductor_partition_rule_context,
-    resolve_defined_ops,
+    should_split,
 )
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
@@ -33,6 +33,7 @@ from .compiler_interface import (
     EagerAdaptor,
     InductorAdaptor,
     InductorStandaloneAdaptor,
+    is_compile_cache_enabled,
 )
 from .counter import compilation_counter
 from .inductor_pass import InductorPass
@@ -51,7 +52,9 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
             and hasattr(torch._inductor, "standalone_compile")
         ):
             logger.debug("Using InductorStandaloneAdaptor")
-            return InductorStandaloneAdaptor()
+            return InductorStandaloneAdaptor(
+                compilation_config.compile_cache_save_format
+            )
         else:
             logger.debug("Using InductorAdaptor")
             return InductorAdaptor()
@@ -95,10 +98,9 @@ class CompilerManager:
         compilation (e.g. partition rules, pass context)."""
         with pass_context(runtime_shape):
             if self.compilation_config.use_inductor_graph_partition:
-                inductor_partition_ops = resolve_defined_ops(
+                with inductor_partition_rule_context(
                     self.compilation_config.splitting_ops
-                )
-                with inductor_partition_rule_context(inductor_partition_ops):
+                ):
                     yield
             else:
                 yield
@@ -238,7 +240,7 @@ class CompilerManager:
         assert compiled_graph is not None, "Failed to compile the graph"
 
         # store the artifact in the cache
-        if not envs.VLLM_DISABLE_COMPILE_CACHE and handle is not None:
+        if is_compile_cache_enabled(additional_inductor_config) and handle is not None:
             self.cache[(runtime_shape, graph_index, self.compiler.name)] = handle
             compilation_counter.num_cache_entries_updated += 1
             self.is_cache_updated = True
@@ -301,7 +303,7 @@ class SplitItem:
 
 
 def split_graph(
-    graph: fx.GraphModule, resolved_ops: list[torch._ops.OpOverload]
+    graph: fx.GraphModule, splitting_ops: list[str]
 ) -> tuple[fx.GraphModule, list[SplitItem]]:
     # split graph by ops
     subgraph_id = 0
@@ -310,12 +312,8 @@ def split_graph(
     for node in graph.graph.nodes:
         if node.op in ("output", "placeholder"):
             continue
-        # Match node.target against resolved_ops
-        # node.target can be OpOverloadPacket, need to check .default
-        if node.op == "call_function" and (
-            node.target in resolved_ops
-            or (hasattr(node.target, "default") and node.target.default in resolved_ops)
-        ):
+
+        if should_split(node, splitting_ops):
             subgraph_id += 1
             node_to_subgraph_id[node] = subgraph_id
             split_op_graphs.append(subgraph_id)
@@ -610,7 +608,9 @@ class VllmBackend:
         os.makedirs(local_cache_dir, exist_ok=True)
         self.compilation_config.local_cache_dir = local_cache_dir
 
-        disable_cache = envs.VLLM_DISABLE_COMPILE_CACHE
+        disable_cache = not is_compile_cache_enabled(
+            self.compilation_config.inductor_compile_config
+        )
 
         if disable_cache:
             logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
@@ -649,8 +649,7 @@ class VllmBackend:
         else:
             fx_split_ops = self.compilation_config.splitting_ops or []
 
-        resolved_split_ops = resolve_defined_ops(fx_split_ops)
-        self.split_gm, self.piecewise_graphs = split_graph(graph, resolved_split_ops)
+        self.split_gm, self.piecewise_graphs = split_graph(graph, fx_split_ops)
 
         from torch._dynamo.utils import lazy_format_graph_code
 
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index 7294ddce64ba1..69d4606d73ebd 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -9,7 +9,6 @@ from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
 from torch.distributed._symmetric_memory import enable_symm_mem_for_group
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce
 from vllm.distributed.parallel_state import (
@@ -450,34 +449,41 @@ class AsyncTPPass(VllmPatternMatcherPass):
         logger.debug("Replaced %s patterns", self.matched_count)
 
 
+# Max size of the input tensor per world size per device capability
+# to use flashinfer fused allreduce
+FI_ALLREDUCE_FUSION_MAX_SIZE_MB: dict[int, dict[int, float]] = {
+    90: {
+        2: 64,  # 64MB
+        4: 2,  # 2MB
+        8: 0.5,  # 0.5MB
+    },
+    100: {
+        2: 64,  # 64MB
+        4: 32,  # 32MB
+        8: 1,  # 1MB
+    },
+}
+
+# Max size of the input tensor per world size per device capability
+# to use flashinfer one shot fused allreduce
+# OneShot max size is at most 64MB / world size (FlashInfer restriction)
+_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB: dict[int, dict[int, float]] = {
+    90: {
+        2: 32,  # 32MB
+        4: 2,  # 2MB
+        8: 0.5,  # 0.5MB
+    },
+    100: {
+        2: 32,  # 32MB
+        4: 4,  # 4MB
+        8: 1,  # 1MB
+    },
+}
+
+
 if flashinfer_comm is not None:
     _FI_WORKSPACE_TENSOR = None
-
     MiB = 1024 * 1024
-    # Max size of the input tensor per world size
-    # to use flashinfer fused allreduce
-    _FI_MAX_SIZES = {
-        2: 64 * MiB,  # 64MB
-        4: MiB,  # 1MB
-        6: MiB // 2,  # 512KB
-        8: MiB // 2,  # 512KB
-    }
-
-    try:
-        _FI_MAX_SIZES.update(
-            {
-                int(k): int(float(v) * MiB)
-                for k, v in envs.VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB.items()
-            }
-        )
-    except Exception as e:
-        raise ValueError(
-            "Failed to parse VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB: " + str(e)
-        ) from e
-
-    # opt for a more conservative default value
-    # when world size is not in _FI_MAX_SIZES
-    _DEFAULT_FI_MAX_SIZE = MiB // 2
 
     def call_trtllm_fused_allreduce_norm(
         allreduce_in: torch.Tensor,
@@ -491,7 +497,6 @@ if flashinfer_comm is not None:
         fp32_acc: bool,
         max_token_num: int,
         pattern_code: int,
-        fuse_rms_quant: bool,
         norm_out: torch.Tensor | None = None,
         quant_out: torch.Tensor | None = None,
         scale_out: torch.Tensor | None = None,
@@ -500,12 +505,20 @@ if flashinfer_comm is not None:
         num_tokens, hidden_size = allreduce_in.shape
         element_size = allreduce_in.element_size()
         current_tensor_size = num_tokens * hidden_size * element_size
-        max_fusion_size = max_token_num * hidden_size * element_size
-        use_flashinfer = current_tensor_size <= min(
-            _FI_MAX_SIZES.get(world_size, _DEFAULT_FI_MAX_SIZE),
-            max_fusion_size,
-        )
-        if use_flashinfer:
+
+        if num_tokens <= max_token_num:
+            device_capability = current_platform.get_device_capability().to_int()
+            # Get one shot input size limit for the current world size
+            # for the current device capability
+            max_one_shot_size_mb = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB.get(
+                device_capability, {}
+            ).get(world_size, None)
+            # Use one shot if no max size for one shot is specified
+            use_oneshot = (
+                max_one_shot_size_mb is None
+                or current_tensor_size <= max_one_shot_size_mb * MiB
+            )
+
             assert _FI_WORKSPACE_TENSOR is not None, (
                 "Flashinfer must be enabled when using flashinfer"
             )
@@ -532,7 +545,7 @@ if flashinfer_comm is not None:
                 hidden_dim=allreduce_in.shape[-1],
                 workspace_ptrs=_FI_WORKSPACE_TENSOR,
                 launch_with_pdl=launch_with_pdl,
-                use_oneshot=True,
+                use_oneshot=use_oneshot,
                 trigger_completion_at_end=trigger_completion_at_end,
                 fp32_acc=fp32_acc,
                 pattern_code=pattern_code,
@@ -545,7 +558,7 @@ if flashinfer_comm is not None:
             )
         else:
             allreduce_out = tensor_model_parallel_all_reduce(allreduce_in)
-            if scale_factor is not None and scale_out is None and fuse_rms_quant:
+            if scale_factor is not None and scale_out is None:
                 # Do fused rms norm static fp8 quant fused op
                 if norm_out is None:
                     torch.ops._C.fused_add_rms_norm_static_fp8_quant(
@@ -568,15 +581,10 @@ if flashinfer_comm is not None:
                     norm_out = allreduce_out
                 else:
                     torch.ops._C.rms_norm(norm_out, allreduce_out, rms_gamma, rms_eps)
-                if scale_factor is not None:
-                    if scale_out is not None:
-                        torch.ops._C.scaled_fp4_quant(
-                            quant_out, norm_out, scale_out, scale_factor
-                        )
-                    else:
-                        torch.ops._C.static_scaled_fp8_quant(
-                            quant_out, norm_out, scale_factor
-                        )
+                if scale_factor is not None and scale_out is not None:
+                    torch.ops._C.scaled_fp4_quant(
+                        quant_out, norm_out, scale_out, scale_factor
+                    )
             if scale_factor is None or norm_out is not None:
                 # we need to return allreduce output
                 # in cases of non quant fused AR + RMS norm
@@ -595,7 +603,6 @@ if flashinfer_comm is not None:
         fp32_acc: bool,
         max_token_num: int,
         pattern_code: int,
-        fuse_rms_quant: bool,
         norm_out: torch.Tensor | None = None,
         quant_out: torch.Tensor | None = None,
         scale_out: torch.Tensor | None = None,
@@ -629,7 +636,6 @@ class FlashInferFusedAllReduceParams:
         world_size: int,
         use_fp32_lamport: bool = False,
         max_token_num: int = 1024,
-        fuse_rms_quant: bool = False,
     ):
         self.rank = rank
         self.world_size = world_size
@@ -637,9 +643,7 @@ class FlashInferFusedAllReduceParams:
         self.trigger_completion_at_end = True
         self.launch_with_pdl = True
         self.fp32_acc = True
-        self.use_oneshot = False
         self.max_token_num = max_token_num
-        self.fuse_rms_quant = fuse_rms_quant
 
     def get_trtllm_fused_allreduce_kwargs(self):
         return {
@@ -649,7 +653,6 @@ class FlashInferFusedAllReduceParams:
             "trigger_completion_at_end": self.trigger_completion_at_end,
             "fp32_acc": self.fp32_acc,
             "max_token_num": self.max_token_num,
-            "fuse_rms_quant": self.fuse_rms_quant,
         }
 
 
@@ -1119,23 +1122,35 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
                 "skipping allreduce fusion pass"
             )
             return
-        # Check if the world size is supported
-        if self.tp_size not in _FI_MAX_SIZES:
+        max_size = config.compilation_config.pass_config.flashinfer_max_size(
+            self.tp_size
+        )
+        if max_size is None:
+            # Flashinfer doesn't support current world size
             logger.warning(
                 "Flashinfer allreduce fusion is not supported for world size %s",
                 self.tp_size,
             )
             return
-        max_num_token = min(
-            _FI_MAX_SIZES.get(self.tp_size, _DEFAULT_FI_MAX_SIZE)
-            // (self.hidden_dim * self.tp_size * (4 if use_fp32_lamport else 2)),
-            config.compilation_config.pass_config.fi_allreduce_fusion_max_token_num,
+        element_size = 4 if use_fp32_lamport else 2
+        self.max_token_num = max_size // (self.hidden_dim * element_size)
+        # take the min to save workspace size and we'll never use more
+        # than max_num_batched_tokens anyways
+        self.max_token_num = min(
+            self.max_token_num, config.scheduler_config.max_num_batched_tokens
         )
+        logger.debug_once(
+            f"Flashinfer max size: {max_size // (1024 * 1024)} MB,"
+            "Maximal number of tokens used by "
+            f"Flashinfer Allreduce Fusion: {self.max_token_num}",
+            scope="global",
+        )
+
         self.ipc_handles, workspace_tensor = (
             flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
                 tp_rank=rank,
                 tp_size=self.tp_size,
-                max_token_num=max_num_token,
+                max_token_num=self.max_token_num,
                 hidden_dim=self.hidden_dim,
                 group=self.group,
                 use_fp32_lamport=use_fp32_lamport,
@@ -1148,10 +1163,7 @@ class AllReduceFusionPass(VllmPatternMatcherPass):
             rank=rank,
             world_size=self.tp_size,
             use_fp32_lamport=use_fp32_lamport,
-            max_token_num=max_num_token,
-            # fuse rms norm static fp8 quant fused op
-            # in fallback path, when we don't use flashinfer
-            fuse_rms_quant=config.compilation_config.pass_config.enable_fusion,
+            max_token_num=self.max_token_num,
         )
 
         self.register_patterns()
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 0a3f0769db941..b0cdb08884a3b 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -6,7 +6,7 @@ import hashlib
 import os
 from collections.abc import Callable
 from contextlib import ExitStack
-from typing import Any
+from typing import Any, Literal
 from unittest.mock import patch
 
 import torch
@@ -163,6 +163,23 @@ def get_inductor_factors() -> list[Any]:
     return factors
 
 
+def is_compile_cache_enabled(
+    vllm_additional_inductor_config: dict[str, Any],
+) -> bool:
+    vllm_inductor_config_disable_cache = vllm_additional_inductor_config.get(
+        "force_disable_caches", False
+    )
+
+    # TODO(gmagogsfm): Replace torch._inductor.config.force_disable_caches
+    # with torch.compiler.config.force_disable_caches when minimum PyTorch
+    # version reaches 2.10
+    return (
+        not envs.VLLM_DISABLE_COMPILE_CACHE
+        and not torch._inductor.config.force_disable_caches
+        and not vllm_inductor_config_disable_cache
+    )
+
+
 class InductorStandaloneAdaptor(CompilerInterface):
     """
     The adaptor for the Inductor compiler.
@@ -175,6 +192,9 @@ class InductorStandaloneAdaptor(CompilerInterface):
 
     name = "inductor_standalone"
 
+    def __init__(self, save_format: Literal["binary", "unpacked"]):
+        self.save_format = save_format
+
     def compute_hash(self, vllm_config: VllmConfig) -> str:
         factors = get_inductor_factors()
         hash_str = hashlib.md5(
@@ -219,8 +239,9 @@ class InductorStandaloneAdaptor(CompilerInterface):
         # Save the compiled artifact to disk in the specified path
         assert key is not None
         path = os.path.join(self.cache_dir, key)
-        if not envs.VLLM_DISABLE_COMPILE_CACHE:
-            compiled_graph.save(path=path, format="unpacked")
+
+        if is_compile_cache_enabled(compiler_config):
+            compiled_graph.save(path=path, format=self.save_format)
             compilation_counter.num_compiled_artifacts_saved += 1
         return compiled_graph, (key, path)
 
@@ -237,7 +258,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
         assert isinstance(handle[1], str)
         path = handle[1]
         inductor_compiled_graph = torch._inductor.CompiledArtifact.load(
-            path=path, format="unpacked"
+            path=path, format=self.save_format
         )
         from torch._inductor.compile_fx import graph_returns_tuple
 
@@ -469,10 +490,8 @@ class InductorAdaptor(CompilerInterface):
                 config_patches=current_config,
             )
 
-        # We treat VLLM_DISABLE_COMPILE_CACHE as the overall switch for torch
-        # compilation cache. So turn off the checks if we disable the
-        # compilation cache.
-        if not envs.VLLM_DISABLE_COMPILE_CACHE:
+        # Turn off the checks if we disable the compilation cache.
+        if is_compile_cache_enabled(compiler_config):
             if hash_str is None:
                 raise RuntimeError(
                     "vLLM failed to compile the model. The most "
diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py
index cea4f9a816377..08bd27e809526 100644
--- a/vllm/compilation/partition_rules.py
+++ b/vllm/compilation/partition_rules.py
@@ -2,101 +2,71 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
-import logging
-from typing import TYPE_CHECKING
 
-from torch._library.utils import lookup_op
+import torch
 
 from vllm.logger import init_logger
 
-if TYPE_CHECKING:
-    import torch
-
 logger = init_logger(__name__)
 
 
-def resolve_defined_ops(op_names: list[str]) -> list["torch._ops.OpOverload"]:
-    """Resolve operator names to OpOverload objects.
-
-    Skips operators that fail to resolve (e.g., operators not registered or
-    model-specific operators not present in the current model).
-
-    Note: Users should inspect the operator graph before lowering and ensure
-    the specified operators are present in the final graph. Built-in PyTorch
-    operators (aten::*, torch::*) may be decomposed, fused, or transformed
-    during Inductor's compilation passes, so use them with caution.
-
-    Args:
-        op_names: List of operator names in PyTorch format
-            (e.g., "vllm::unified_attention")
-
-    Returns:
-        List of successfully resolved operator overloads
+def should_split(node: torch.fx.Node, splitting_ops: list[str]) -> bool:
+    """
+    Check if a node should be split for dynamo graph partition.
+    It operates on dynamo graph, so the node.target can be anything.
+    We need to check and split only on OpOverload and OpOverloadPacket.
     """
-    resolved = []
-    for op_name in op_names:
-        try:
-            resolved.append(lookup_op(op_name))
-        except Exception:
-            # Skip operators that don't exist (e.g., model-specific ops)
-            # Do not warn for attention ops, warn for others
-            # (most likely manually specified)
-            from vllm.config import CompilationConfig
 
-            logger.log(
-                logging.DEBUG
-                if op_name in CompilationConfig._attention_ops
-                else logging.WARNING,
-                "Failed to resolve operator for CUDAGraph partition: %s",
-                op_name,
-            )
-            continue
+    if node.op != "call_function":
+        return False
 
-    return resolved
+    target = node.target
+
+    if isinstance(target, torch._ops.OpOverloadPacket):
+        # Example: "aten::add"
+        return target._qualified_op_name in splitting_ops
+
+    if isinstance(target, torch._ops.OpOverload):
+        # Example: "aten::add"
+        packet_name = target.name()
+
+        # Example: "aten::add.default"
+        op_overload_name = f"{packet_name}.{target._overloadname}"
+        return op_overload_name in splitting_ops or packet_name in splitting_ops
+
+    return False
 
 
 @contextlib.contextmanager
-def inductor_partition_rule_context(overloads: list["torch._ops.OpOverload"]):
+def inductor_partition_rule_context(splitting_ops: list[str]):
     """Context manager to temporarily register Inductor partition rules.
 
     Registers custom partition rules for specified operators, forcing the
     Inductor scheduler to partition the graph at these operators. The rules
     are automatically restored to their previous state on exit.
 
-    Note: Callers should use resolve_defined_ops() to convert operator names
-    to OpOverload objects before calling this function.
-
     Args:
-        overloads: List of resolved operator overload objects.
+        splitting_ops: List of operator names to partition on.
     """
-    if not overloads:
+    if not splitting_ops:
         logger.debug("No partition ops provided; skipping rule registration.")
         yield
         return
 
-    from torch._inductor.scheduler import (  # type: ignore
-        _custom_should_partition_fns,
-        register_should_partition_rule,
-    )
-
-    def _always_partition(*_args, **_kwargs):
-        return True
-
     # Save current state before registering
-    saved_rules = _custom_should_partition_fns.copy()
 
-    for overload in overloads:
-        register_should_partition_rule(
-            overload,
-            _always_partition,
-        )
+    saved_splitting_ops: list[str] = list(
+        torch._inductor.config.custom_should_partition_ops
+    )
+    torch._inductor.config.custom_should_partition_ops = splitting_ops
 
-    logger.debug("Registered inductor partition rules for %d operators", len(overloads))
+    logger.debug(
+        "Registered inductor partition rules for %d operators", len(splitting_ops)
+    )
 
     try:
         yield
     finally:
         # Clear and restore previous state
-        _custom_should_partition_fns.clear()
-        _custom_should_partition_fns.update(saved_rules)
+        torch._inductor.config.custom_should_partition_ops = saved_splitting_ops
         logger.debug("Restored previous partition rules state.")
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 3bc35a8f71983..dfda2adf1d3b0 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -113,27 +113,6 @@ class PostGradPassManager(CustomGraphPass):
             self.post_cleanup = PostCleanupPass(config)
             self.fix_functionalization = FixFunctionalizationPass(config)
 
-        # [HACK: Bug with Inductor graph partition and torch.compile cache]
-        # In PyTorch 2.9, torch.compile has a bug where the graph
-        # partition is not taken into account during caching.
-        # Because vLLM's Mode.VLLM_COMPILE is the only mode that uses
-        # Inductor graph partition, and VLLM_COMPILE implies there
-        # is a PostGradPassManager, we put the list of operators to graph
-        # partition into the PostGradPassManager's uuid (which
-        # then gets incorporated into Inductor's FX graph cache key).
-        # Remove this hack whenever torch.compile fixes it.
-
-        # This is the list of operators that vLLM asks Inductor to split.
-        self.inductor_splitting_ops = []
-        if (
-            config.compilation_config.use_inductor_graph_partition
-            and config.compilation_config.splitting_ops is not None
-        ):
-            # Sort them so we're not dependent on the ordering.
-            self.inductor_splitting_ops = sorted(
-                config.compilation_config.splitting_ops
-            )
-
     def add(self, pass_: InductorPass):
         assert isinstance(pass_, InductorPass)
         self.passes.append(pass_)
@@ -144,16 +123,9 @@ class PostGradPassManager(CustomGraphPass):
         affects compilation caching. Its uuid depends on the UUIDs of all
         dependent passes and the pass config. See InductorPass for more info.
         """
-        state = {
-            "pass_config": self.pass_config.uuid(),
-            "passes": [],
-            "inductor_splitting_ops": [],
-        }
+        state = {"pass_config": self.pass_config.uuid(), "passes": []}
         for pass_ in self.passes:
             state["passes"].append(pass_.uuid())
         state["passes"].append(self.fix_functionalization.uuid())
 
-        # See [HACK: Bug with Inductor graph partition and torch.compile cache]
-        state["inductor_splitting_ops"].extend(self.inductor_splitting_ops)
-
         return InductorPass.hash_dict(state)
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 6a5bd5ef4e07c..92cf16f259fe7 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -7,11 +7,12 @@ from collections import Counter
 from collections.abc import Callable
 from dataclasses import asdict, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
+import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -110,11 +111,52 @@ class PassConfig:
     """Whether to enable async TP."""
     enable_fi_allreduce_fusion: bool = False
     """Whether to enable flashinfer allreduce fusion."""
-    fi_allreduce_fusion_max_token_num: int = 16384
-    """Max number of tokens to used in flashinfer allreduce fusion."""
+    fi_allreduce_fusion_max_size_mb: float | None = None
+    """The threshold of the communicated tensor sizes under which
+    vllm should use flashinfer fused allreduce. Specified as a
+    float in MB.
+    Unspecified will fallback to default values 
+    which are compute capability and world size dependent.
+        FI_ALLREDUCE_FUSION_MAX_SIZE_MB = {
+            90: {
+                2: 64,  # 64MB
+                4: 2,  # 2MB
+                8: 1,  # 1MB
+            },
+            100: {
+                2: 64,  # 64MB
+                4: 32,  # 32MB
+                8: 1,  # 1MB
+            },
+        }, where key is the device capability"""
 
     # TODO(luka) better pass enabling system.
 
+    def flashinfer_max_size(self, world_size: int) -> int | None:
+        """
+        Returns the max communication size in bytes for flashinfer
+        allreduce fusion for the given world size. Returns None if world size
+        is not supported by configs as it's not supported by flashinfer.
+        """
+
+        MiB = 1024 * 1024
+        max_size_mb = self.fi_allreduce_fusion_max_size_mb
+        if max_size_mb is None:
+            max_size_mb = self.default_fi_allreduce_fusion_max_size_mb().get(world_size)
+
+        return int(max_size_mb * MiB) if max_size_mb is not None else None
+
+    @staticmethod
+    def default_fi_allreduce_fusion_max_size_mb() -> dict[int, float]:
+        from vllm.compilation.collective_fusion import FI_ALLREDUCE_FUSION_MAX_SIZE_MB
+        from vllm.platforms import current_platform
+
+        if not current_platform.is_cuda():
+            return {}
+        return FI_ALLREDUCE_FUSION_MAX_SIZE_MB.get(
+            current_platform.get_device_capability().to_int(), {}
+        )
+
     def uuid(self):
         """
         Produces a hash unique to the pass configuration.
@@ -135,6 +177,11 @@ class PassConfig:
                     "Fusion enabled but reshape elimination disabled. "
                     "Attention + quant (fp8) fusion might not work"
                 )
+            if self.enable_fi_allreduce_fusion:
+                logger.warning_once(
+                    "Fusion enabled but reshape elimination disabled. "
+                    "Allreduce + rms norm + quant (fp8) fusion might not work"
+                )
 
 
 @config
@@ -149,6 +196,7 @@ class CompilationConfig:
         - [`backend`][vllm.config.CompilationConfig.backend]
         - [`custom_ops`][vllm.config.CompilationConfig.custom_ops]
         - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
+        - [`compile_mm_encoder`][vllm.config.CompilationConfig.compile_mm_encoder]
     - CudaGraph capture:
         - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
         - [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
@@ -208,6 +256,15 @@ class CompilationConfig:
     """The directory to store the compiled graph, to accelerate Inductor
     compilation. By default, it will use model-related information to generate
     a cache directory."""
+    compile_cache_save_format: Literal["binary", "unpacked"] = field(
+        default_factory=lambda: envs.VLLM_COMPILE_CACHE_SAVE_FORMAT
+    )
+    """Format for saving torch compile cache:\n
+    - "binary": saves as binary file (multiprocess safe)\n
+    - "unpacked": saves as directory structure for inspection/debugging
+    (NOT multiprocess safe)\n
+    Defaults to `VLLM_COMPILE_CACHE_SAVE_FORMAT` if not specified.
+    """
     backend: str = ""
     """The backend for compilation. It needs to be a string:
 
@@ -257,6 +314,9 @@ class CompilationConfig:
 
     If None, defaults to attention ops for piecewise cudagraphs.
     If empty list [], no ops are excluded (suitable for full cudagraphs)."""
+    compile_mm_encoder: bool = True
+    """Whether or not to compile the multimodal encoder.
+    Currently, this only works for `Qwen2_5_vl`."""
 
     # Inductor capture
     use_inductor: bool | None = None
@@ -452,7 +512,7 @@ class CompilationConfig:
         "vllm::short_conv",
         "vllm::linear_attention",
         "vllm::plamo2_mamba_mixer",
-        "vllm::gdn_attention",
+        "vllm::gdn_attention_core",
         "vllm::kda_attention",
         "vllm::sparse_attn_indexer",
     ]
@@ -479,6 +539,7 @@ class CompilationConfig:
         factors.append(self.inductor_compile_config)
         factors.append(self.inductor_passes)
         factors.append(self.pass_config.uuid())
+        factors.append(self.compile_cache_save_format)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
     def __repr__(self) -> str:
@@ -520,6 +581,16 @@ class CompilationConfig:
             return CUDAGraphMode[value.upper()]
         return value
 
+    @field_validator("compile_cache_save_format")
+    @classmethod
+    def validate_compile_cache_save_format(cls, value: str) -> str:
+        if value not in ("binary", "unpacked"):
+            raise ValueError(
+                f"compile_cache_save_format must be 'binary' or 'unpacked', "
+                f"got: {value}"
+            )
+        return value
+
     def __post_init__(self) -> None:
         if self.level is not None:
             logger.warning(
diff --git a/vllm/config/load.py b/vllm/config/load.py
index d625c1ac987e7..e424f8c5edb62 100644
--- a/vllm/config/load.py
+++ b/vllm/config/load.py
@@ -40,6 +40,8 @@ class LoadConfig:
     more information.\n
     - "runai_streamer" will load the Safetensors weights using Run:ai Model
     Streamer.\n
+    - "runai_streamer_sharded" will load weights from pre-sharded checkpoint
+    files using Run:ai Model Streamer.\n
     - "bitsandbytes" will load the weights using bitsandbytes quantization.\n
     - "sharded_state" will load weights from pre-sharded checkpoint files,
     supporting efficient loading of tensor-parallel models.\n
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 082f90653f5af..44c044c76168d 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -168,12 +168,6 @@ class ModelConfig:
     """The specific revision to use for the model code on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    rope_scaling: dict[str, Any] = field(default_factory=dict)
-    """RoPE scaling configuration. For example,
-    `{"rope_type":"dynamic","factor":2.0}`."""
-    rope_theta: float | None = None
-    """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE
-    theta improves the performance of the scaled model."""
     tokenizer_revision: str | None = None
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
@@ -338,8 +332,6 @@ class ModelConfig:
         factors.append(self.generation_config)
         factors.append(self.model_impl)
         factors.append(self.override_generation_config)
-        factors.append(self.rope_scaling)
-        factors.append(self.rope_theta)
         factors.append(self.video_pruning_rate)
         factors.append(self.enable_prompt_embeds)
 
@@ -481,25 +473,6 @@ class ModelConfig:
                     hf_overrides_kw[key] = value
             hf_overrides_fn = None
 
-        if self.rope_scaling:
-            hf_override: dict[str, Any] = {"rope_scaling": self.rope_scaling}
-            hf_overrides_kw.update(hf_override)
-            hf_overrides_str = json.dumps(hf_overrides_kw)
-            msg = (
-                "`--rope-scaling` will be removed in a future release. "
-                f"'Please instead use `--hf-overrides '{hf_overrides_str}'`"
-            )
-            warnings.warn(DeprecationWarning(msg), stacklevel=2)
-        if self.rope_theta is not None:
-            hf_override = {"rope_theta": self.rope_theta}
-            hf_overrides_kw.update(hf_override)
-            hf_overrides_str = json.dumps(hf_overrides_kw)
-            msg = (
-                "`--rope-theta` will be removed in a future release. "
-                f"'Please instead use `--hf-overrides '{hf_overrides_str}'`"
-            )
-            warnings.warn(DeprecationWarning(msg), stacklevel=2)
-
         self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
 
         if (
@@ -1231,6 +1204,8 @@ class ModelConfig:
             "kimi_k2",
             "kimi_linear",
             "longcat_flash",
+            "pangu_ultra_moe",
+            "pangu_ultra_moe_mtp",
         ):
             return self.hf_text_config.kv_lora_rank is not None
         elif self.hf_text_config.model_type == "eagle":
@@ -1379,6 +1354,7 @@ class ModelConfig:
             or self.hf_config.model_type == "glm4_moe_mtp"
             or self.hf_config.model_type == "ernie_mtp"
             or self.hf_config.model_type == "qwen3_next_mtp"
+            or self.hf_config.model_type == "pangu_ultra_moe_mtp"
         ):
             total_num_hidden_layers = getattr(
                 self.hf_text_config, "num_nextn_predict_layers", 0
@@ -1483,6 +1459,12 @@ class ModelConfig:
         if chunk_size is None:
             # used by e.g. Mamba2, NemotronH, Zamba
             chunk_size = getattr(self.hf_text_config, "chunk_size", None)
+
+        # Since Mamba1 does not have a chunk notion
+        # we use a default chunk size of 1024.
+        if chunk_size is None:
+            chunk_size = 2048
+
         return chunk_size
 
     def get_multimodal_config(self) -> MultiModalConfig:
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 82d575f24690d..b19c8beeae3df 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -227,6 +227,17 @@ class ParallelConfig:
     not change by dcp, it simply reuse the GPUs of TP group, and tp_size
     needs to be divisible by dcp_size."""
 
+    dcp_kv_cache_interleave_size: int = 1
+    """Interleave size of kv_cache storage while using dcp or cp > 1,
+    store interleave_size tokens on (d)cp i,
+    then store next interleave_size tokens on (d)cp i+1.
+    Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size.
+    Interleave_size=block_size: block-level align, first fill the block on first rank,
+    token is stored on rank i+1 block j after rank i block j is full.
+    Block_size should be greater than or equal to dcp_kv_cache_interleave_size.
+    Block_size should be divisible by dcp_kv_cache_interleave_size.
+    """
+
     _api_process_count: int = Field(default=1, gt=0)
     """
     The number of API processes initialized.
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index b837b830e774b..47aa343527b39 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -4,7 +4,7 @@
 import hashlib
 from collections.abc import Callable
 from dataclasses import InitVar
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 from pydantic import Field, field_validator, model_validator
 from pydantic.dataclasses import dataclass
@@ -17,6 +17,10 @@ from vllm.utils import (
     MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
     POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
 )
+from vllm.utils.import_utils import resolve_obj_by_qualname
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.interface import SchedulerInterface
 
 logger = init_logger(__name__)
 
@@ -120,7 +124,7 @@ class SchedulerConfig:
 
     # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
     # (default) or "mod.custom_class".
-    scheduler_cls: str | type[object] = "vllm.v1.core.sched.scheduler.Scheduler"
+    scheduler_cls: str | type[object] = Field(default=None)
     """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
     the default scheduler. Can be a class directly or the path to a class of
     form "mod.custom_class"."""
@@ -132,12 +136,34 @@ class SchedulerConfig:
     """
 
     async_scheduling: bool = False
-    """EXPERIMENTAL: If set to True, perform async scheduling. This may help
-    reduce the CPU overheads, leading to better latency and throughput. However,
-    async scheduling is currently not supported with some features such as
-    structured outputs, speculative decoding, and pipeline parallelism.
+    """If set to True, perform async scheduling. This helps to avoid gaps in
+    GPU utilization, leading to better latency and throughput.
+    Async scheduling is currently not supported with some features such as
+    speculative decoding and pipeline parallelism.
     """
 
+    def get_scheduler_cls(self) -> type["SchedulerInterface"]:
+        if self.scheduler_cls is None:
+            if self.async_scheduling:
+                from vllm.v1.core.sched.async_scheduler import AsyncScheduler
+
+                return AsyncScheduler
+            from vllm.v1.core.sched.scheduler import Scheduler
+
+            return Scheduler
+
+        # This warning can be removed once the Scheduler interface is
+        # finalized and we can maintain support for scheduler classes that
+        # implement it
+        logger.warning_once(
+            "Using custom scheduler class %s. This scheduler interface is "
+            "not public and compatibility may not be maintained.",
+            self.scheduler_cls,
+        )
+        if not isinstance(self.scheduler_cls, str):
+            return cast(type["SchedulerInterface"], self.scheduler_cls)
+        return resolve_obj_by_qualname(self.scheduler_cls)
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -161,6 +187,8 @@ class SchedulerConfig:
         "max_num_seqs",
         "max_model_len",
         "enable_chunked_prefill",
+        "scheduler_cls",
+        "async_scheduling",
         mode="wrap",
     )
     @classmethod
@@ -242,9 +270,6 @@ class SchedulerConfig:
                 self.long_prefill_token_threshold,
             )
 
-        if self.async_scheduling:
-            self.scheduler_cls = "vllm.v1.core.sched.async_scheduler.AsyncScheduler"
-
     @model_validator(mode="after")
     def _verify_args(self) -> Self:
         if (
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index 1f956526dcdc6..31cdeabe501d2 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -12,7 +12,7 @@ from typing_extensions import Self
 from vllm.config.parallel import ParallelConfig
 from vllm.config.utils import config
 from vllm.logger import init_logger
-from vllm.utils.import_utils import LazyLoader
+from vllm.utils.import_utils import LazyLoader, has_arctic_inference
 
 if TYPE_CHECKING:
     from transformers import PretrainedConfig
@@ -41,7 +41,9 @@ SpeculativeMethod = Literal[
     "qwen3_next_mtp",
     "mimo_mtp",
     "longcat_flash_mtp",
+    "pangu_ultra_moe_mtp",
     "mtp",
+    "suffix",
 ]
 MTP_MODEL_TYPES = (
     "deepseek_mtp",
@@ -50,6 +52,7 @@ MTP_MODEL_TYPES = (
     "ernie_mtp",
     "qwen3_next_mtp",
     "longcat_flash_mtp",
+    "pangu_ultra_moe_mtp",
 )
 
 
@@ -129,6 +132,27 @@ class SpeculativeConfig:
     draft_parallel_config: SkipValidation[ParallelConfig] = None  # type: ignore
     """The parallel configuration for the draft model initialized internal."""
 
+    # Suffix decoding configuration
+    suffix_decoding_max_tree_depth: int = 24
+    """The maximum depth of the suffix decoding global and prompt trees. The
+    tree depth limits the sum of the prefix match and speculation lengths."""
+
+    suffix_decoding_max_cached_requests: int = 10000
+    """The maximum number of requests to cache in the global suffix tree. If
+    exceeded, will trigger eviction in FIFO order. If set to 0, the global
+    suffix tree is disabled and past responses are not cached (prompt trees
+    are still used)."""
+
+    suffix_decoding_max_spec_factor: float = 1.0
+    """The maximum spec factor for suffix decoding. The spec factor controls
+    speculation lengths based on the prefix match length: max_spec_tokens =
+    max_spec_factor * prefix_match_length."""
+
+    suffix_decoding_min_token_prob: float = 0.1
+    """The minimum token probability for suffix decoding. Will only speculate
+    tokens with estimated probability (based on frequency counts) greater than
+    or equal to this value."""
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -157,6 +181,13 @@ class SpeculativeConfig:
             hf_config.update(
                 {"n_predict": n_predict, "architectures": ["DeepSeekMTPModel"]}
             )
+        if hf_config.model_type in ("pangu_ultra_moe"):
+            hf_config.model_type = "pangu_ultra_moe_mtp"
+        if hf_config.model_type == "pangu_ultra_moe_mtp":
+            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+            hf_config.update(
+                {"n_predict": n_predict, "architectures": ["OpenPanguMTPModel"]}
+            )
 
         if hf_config.architectures[0] == "MiMoForCausalLM":
             hf_config.model_type = "mimo_mtp"
@@ -235,6 +266,8 @@ class SpeculativeConfig:
                     self.quantization = self.target_model_config.quantization
             elif self.method in ("ngram", "[ngram]"):
                 self.model = "ngram"
+            elif self.method == "suffix":
+                self.model = "suffix"
             else:
                 raise ValueError(
                     "num_speculative_tokens was provided but without speculative model."
@@ -282,6 +315,8 @@ class SpeculativeConfig:
             # draft related config as None here.
             self.draft_model_config = self.target_model_config
             self.draft_parallel_config = self.target_parallel_config
+        elif self.method == "suffix":
+            self._validate_suffix_decoding()
         else:
             self.prompt_lookup_max = 0
             self.prompt_lookup_min = 0
@@ -430,6 +465,42 @@ class SpeculativeConfig:
                 )
         return self
 
+    def _validate_suffix_decoding(self):
+        if not has_arctic_inference():
+            raise ImportError(
+                "Arctic Inference is required for suffix decoding. "
+                "Install via `pip install arctic-inference==0.1.1`."
+            )
+        if self.num_speculative_tokens is None:
+            # Suffix decoding decides the actual number of speculative tokens
+            # dynamically and treats num_speculative_tokens as a maximum limit.
+            self.num_speculative_tokens = self.suffix_decoding_max_tree_depth
+            logger.warning(
+                "Defaulted num_speculative_tokens to %s for suffix decoding.",
+                self.num_speculative_tokens,
+            )
+        # Validate values
+        if self.suffix_decoding_max_tree_depth < 1:
+            raise ValueError(
+                f"suffix_decoding_max_tree_depth="
+                f"{self.suffix_decoding_max_tree_depth} must be >= 1"
+            )
+        if self.suffix_decoding_max_cached_requests < 0:
+            raise ValueError(
+                f"suffix_decoding_max_cached_requests="
+                f"{self.suffix_decoding_max_cached_requests} must be >= 0"
+            )
+        if self.suffix_decoding_max_spec_factor < 0:
+            raise ValueError(
+                f"suffix_decoding_max_spec_factor="
+                f"{self.suffix_decoding_max_spec_factor} must be >= 0"
+            )
+        if not 0 <= self.suffix_decoding_min_token_prob <= 1:
+            raise ValueError(
+                f"suffix_decoding_min_token_prob="
+                f"{self.suffix_decoding_min_token_prob} must be in [0, 1]"
+            )
+
     @staticmethod
     def _maybe_override_draft_max_model_len(
         speculative_max_model_len: int | None,
@@ -582,6 +653,6 @@ class SpeculativeConfig:
 
     def __repr__(self) -> str:
         method = self.method
-        model = None if method == "ngram" else self.draft_model_config.model
+        model = None if method in ("ngram", "suffix") else self.draft_model_config.model
         num_spec_tokens = self.num_speculative_tokens
         return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py
index eb1cc7220b8fe..9530d3d81e15d 100644
--- a/vllm/config/structured_outputs.py
+++ b/vllm/config/structured_outputs.py
@@ -37,6 +37,9 @@ class StructuredOutputsConfig:
     reasoning_parser: str = ""
     """Select the reasoning parser depending on the model that you're using.
     This is used to parse the reasoning content into OpenAI API format."""
+    reasoning_parser_plugin: str = ""
+    """Path to a dynamically reasoning parser plugin that can be dynamically
+    loaded and registered."""
     enable_in_reasoning: bool = False
     """Whether to use structured input for reasoning."""
 
@@ -60,6 +63,22 @@ class StructuredOutputsConfig:
 
     @model_validator(mode="after")
     def _validate_structured_output_config(self) -> Self:
+        # Import here to avoid circular import
+        from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
+
+        if self.reasoning_parser_plugin and len(self.reasoning_parser_plugin) > 3:
+            ReasoningParserManager.import_reasoning_parser(self.reasoning_parser_plugin)
+
+        valid_reasoning_parsers = ReasoningParserManager.list_registered()
+        if (
+            self.reasoning_parser != ""
+            and self.reasoning_parser not in valid_reasoning_parsers
+        ):
+            raise ValueError(
+                f"invalid reasoning parser: {self.reasoning_parser} "
+                f"(chose from {{ {','.join(valid_reasoning_parsers)} }})"
+            )
+
         if self.disable_any_whitespace and self.backend not in ("xgrammar", "guidance"):
             raise ValueError(
                 "disable_any_whitespace is only supported for "
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index ee91cb0ef5c36..0fca967d90838 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -353,6 +353,53 @@ class VllmConfig:
                 self.model_config, self.load_config
             )
 
+        executor_backend = self.parallel_config.distributed_executor_backend
+        executor_supports_async_sched = executor_backend in (
+            "mp",
+            "uni",
+            "external_launcher",
+        )
+
+        if self.scheduler_config.async_scheduling:
+            # Async scheduling explicitly enabled, hard fail any incompatibilities.
+            if self.parallel_config.pipeline_parallel_size > 1:
+                raise ValueError(
+                    "Async scheduling is not yet compatible with "
+                    "pipeline_parallel_size > 1."
+                )
+            if self.speculative_config is not None:
+                raise ValueError(
+                    "Async scheduling is not yet compatible with speculative decoding."
+                )
+            if not executor_supports_async_sched:
+                raise ValueError(
+                    "Currently, async scheduling only supports `mp`, `uni`, or "
+                    "`external_launcher` distributed executor backend, but you chose "
+                    f"`{executor_backend}`."
+                )
+        elif self.scheduler_config.async_scheduling is None:
+            # Enable async scheduling unless there is an incompatible option.
+            # NOTE: we won't reach here until async scheduling is enabled by default.
+            if (
+                self.parallel_config.pipeline_parallel_size > 1
+                or self.speculative_config is not None
+            ):
+                logger.warning(
+                    "Async scheduling is not yet supported with speculative decoding "
+                    " or pipeline_parallel_size > 1 and will be disabled."
+                )
+                self.scheduler_config.async_scheduling = False
+            elif not executor_supports_async_sched:
+                logger.warning(
+                    "Async scheduling will be disabled because it is not supported "
+                    "with the `%s` distributed executor backend (only `mp`, `uni`, and "
+                    "`external_launcher` are supported).",
+                    executor_backend,
+                )
+                self.scheduler_config.async_scheduling = False
+            else:
+                self.scheduler_config.async_scheduling = True
+
         from vllm.platforms import current_platform
 
         if (
@@ -467,7 +514,7 @@ class VllmConfig:
                 self.speculative_config is not None
                 and self.speculative_config.use_eagle()
             ):
-                raise NotImplementedError(
+                raise ValueError(
                     "Fast prefill optimization for KV sharing is not "
                     "compatible with EAGLE as EAGLE requires correct logits "
                     "for all tokens while fast prefill gives incorrect logits "
@@ -491,7 +538,7 @@ class VllmConfig:
                     )
                 if not getattr(self.model_config.hf_config, "is_causal", True):
                     disable_chunked_prefill_reasons.append(
-                        "Only models using causal attention supports chunked "
+                        "Only models using causal attention support chunked "
                         "prefill and prefix caching; disabling both."
                     )
             elif self.model_config.is_encoder_decoder:
@@ -561,6 +608,25 @@ class VllmConfig:
             )
         current_platform.check_and_update_config(self)
 
+        # If DCP, ensure the block size is right.
+        if self.parallel_config.decode_context_parallel_size > 1:
+            assert (
+                self.parallel_config.dcp_kv_cache_interleave_size
+                <= self.cache_config.block_size
+                and self.cache_config.block_size
+                % self.parallel_config.dcp_kv_cache_interleave_size
+                == 0
+            ), (
+                f"Block_size({self.cache_config.block_size}) should be greater "
+                "than or equal to and divisible by dcp_kv_cache_interleave_size "
+                f"({self.parallel_config.dcp_kv_cache_interleave_size})."
+            )
+
+        assert (
+            self.parallel_config.dcp_kv_cache_interleave_size == 1
+            or self.speculative_config is None
+        ), "MTP with dcp_kv_cache_interleave_size > 1 is not supported now."
+
         # Do this after all the updates to compilation_config.mode
         if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
             self.compilation_config.set_splitting_ops_for_v1()
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index 9566dbac7f22f..3a849da70e4cb 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -266,14 +266,14 @@ class DeviceCommunicatorBase:
             module
             for module in model.modules()
             # TODO(bnell): Should use isinstance but can't.  Maybe search for
-            # presence of quant_method.init_prepare_finalize?
+            # presence of quant_method.maybe_init_modular_kernel?
             if (
                 module.__class__.__name__ == "FusedMoE"
                 or module.__class__.__name__ == "SharedFusedMoE"
             )
         ]
         for module in moe_modules:
-            module.quant_method.init_prepare_finalize(module)
+            module.maybe_init_modular_kernel()
 
     def dispatch(
         self,
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index f92b3d34af0f1..5046cac2e90a7 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -49,6 +49,16 @@ def to_bytes_big(value: int, size: int) -> bytes:
 logger = init_logger(__name__)
 
 
+def long_wait_time_msg(threshold: int) -> str:
+    return (
+        "No available shared memory broadcast block found "
+        f"in {threshold} seconds. This typically happens "
+        "when some processes are hanging or doing some "
+        "time-consuming work (e.g. compilation, "
+        "weight/kv cache quantization)."
+    )
+
+
 class SpinTimer:
     def record_activity(self):
         pass
@@ -422,11 +432,7 @@ class MessageQueue:
                     # if we wait for a long time, log a message
                     if elapsed > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning:
                         logger.info(
-                            "No available shared memory broadcast block found"
-                            " in %s seconds. This typically happens when some"
-                            " processes are hanging or doing some"
-                            " time-consuming work (e.g. compilation)",
-                            VLLM_RINGBUFFER_WARNING_INTERVAL,
+                            long_wait_time_msg(VLLM_RINGBUFFER_WARNING_INTERVAL)
                         )
                         n_warning += 1
 
@@ -493,11 +499,7 @@ class MessageQueue:
                         elapsed > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning
                     ):
                         logger.info(
-                            "No available shared memory broadcast block found"
-                            " in %s seconds. This typically happens when some"
-                            " processes are hanging or doing some"
-                            " time-consuming work (e.g. compilation).",
-                            VLLM_RINGBUFFER_WARNING_INTERVAL,
+                            long_wait_time_msg(VLLM_RINGBUFFER_WARNING_INTERVAL)
                         )
                         n_warning += 1
 
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index 17716e8a07ac0..526d3ceac7b8f 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -33,7 +33,7 @@ from dataclasses import dataclass
 import torch
 from torch.distributed import ProcessGroup, all_reduce
 
-from vllm.config import ParallelConfig
+from vllm.config import ModelConfig, ParallelConfig
 from vllm.distributed.parallel_state import (
     get_ep_group,
     get_node_count,
@@ -50,7 +50,7 @@ logger = init_logger(__name__)
 
 
 @dataclass
-class EplbState:
+class EplbModelState:
     """EPLB metrics."""
 
     physical_to_logical_map: torch.Tensor
@@ -130,34 +130,46 @@ class EplbState:
     See:
     https://github.com/vllm-project/vllm/pull/22167#pullrequestreview-3086143856
     """
-    expert_load_window_step: int = 0
-    """
-    Current step in the sliding window.
+    model_name: str
+    model: MixtureOfExperts
 
-    Different from `expert_rearrangement_step`, each EP rank may have its own
-    `expert_load_window_step`.
+
+class EplbState:
     """
-    expert_load_window_size: int = 0
-    """
-    Size of the expert load sliding window.
-    This is a constant and is taken from the config.
+    EplbState of each expert parallel model. Key is the model config hash.
     """
 
-    expert_rearrangement_step: int = 0
-    """
-    Steps after last rearrangement.
-    Will trigger a rearrangement if it exceeds the threshold.
+    def __init__(self, parallel_config: ParallelConfig, device: torch.device):
+        self.parallel_config = parallel_config
+        self.device = device
+        self.model_states: dict[str, EplbModelState] = {}
+        """
+        Current step in the sliding window.
 
-    NOTE: Keep in mind that all EP ranks need to have the same
-    `expert_rearrangement_step` value to ensure synchronization.
-    Otherwise, the rearrangement will hang at collective
-    communication calls.
-    """
-    expert_rearrangement_step_interval: int = 0
-    """
-    Interval for expert rearrangement steps.
-    This is a constant and is taken from the config.
-    """
+        Different from `expert_rearrangement_step`, 
+        each EP rank may have its own `expert_load_window_step`.
+        """
+        self.expert_load_window_step: int = 0
+        """
+        Size of the expert load sliding window.
+        This is a constant and is taken from the config.
+        """
+        self.expert_load_window_size: int = 0
+        """
+        Steps after last rearrangement.
+        Will trigger a rearrangement if it exceeds the threshold.
+
+        NOTE: Keep in mind that all EP ranks need to have the same
+        `expert_rearrangement_step` value to ensure synchronization.
+        Otherwise, the rearrangement will hang at collective
+        communication calls.
+        """
+        self.expert_rearrangement_step: int = 0
+        """
+        Interval for expert rearrangement steps.
+        This is a constant and is taken from the config.
+        """
+        self.expert_rearrangement_step_interval: int = 0
 
     @staticmethod
     def build_initial_global_physical_to_logical_map(
@@ -179,26 +191,63 @@ class EplbState:
         ]
         return global_physical_to_logical_map
 
-    @classmethod
-    def build(
-        cls,
+    def validate_ep_configuration(self, new_model: MixtureOfExperts):
+        """
+        Validate that the expert parallel configuration of
+        the new model is the same as the existing models.
+        """
+        if len(self.model_states) > 0:
+            model = next(iter(self.model_states.values())).model
+            if (
+                model.num_routed_experts != new_model.num_routed_experts
+                or model.num_redundant_experts != new_model.num_redundant_experts
+                or model.num_physical_experts != new_model.num_physical_experts
+                or model.num_logical_experts != new_model.num_logical_experts
+                or model.num_expert_groups != new_model.num_expert_groups
+            ):
+                raise RuntimeError(
+                    "Model: {} "
+                    "with config {} "
+                    "{} {} {} {} "
+                    "mismatch with new model {} "
+                    "with config {} "
+                    "{} {} {} {}".format(
+                        type(model),
+                        model.num_routed_experts,
+                        model.num_redundant_experts,
+                        model.num_physical_experts,
+                        model.num_logical_experts,
+                        model.num_expert_groups,
+                        type(new_model),
+                        new_model.num_routed_experts,
+                        new_model.num_redundant_experts,
+                        new_model.num_physical_experts,
+                        new_model.num_logical_experts,
+                        new_model.num_expert_groups,
+                    )
+                )
+
+    def add_model(
+        self,
         model: MixtureOfExperts,
-        device: torch.device,
-        parallel_config: ParallelConfig,
+        model_config: ModelConfig,
         global_expert_load: torch.Tensor | None = None,
         old_global_expert_indices: torch.Tensor | None = None,
         rank_mapping: dict[int, int] | None = None,
-    ) -> "EplbState":
+    ):
         """
         Build the initial EPLB state.
         """
-        physical_to_logical_map_list = cls.build_initial_global_physical_to_logical_map(
-            model.num_routed_experts,
-            model.num_redundant_experts,
+        self.validate_ep_configuration(model)
+        physical_to_logical_map_list = (
+            EplbState.build_initial_global_physical_to_logical_map(
+                model.num_routed_experts,
+                model.num_redundant_experts,
+            )
         )
         physical_to_logical_map = torch.tensor(
             physical_to_logical_map_list,
-            device=device,
+            device=self.device,
         )
         # Assuming 8 GPUs per node, this supports up to
         # (1023 + 1) / 8 = 128 nodes for now.
@@ -212,11 +261,11 @@ class EplbState:
         logical_to_physical_map = torch.full(
             (model.num_logical_experts, max_slots_per_logical_expert),
             -1,
-            device=device,
+            device=self.device,
         )
         logical_replica_count = torch.zeros(
             (model.num_logical_experts,),
-            device=device,
+            device=self.device,
             dtype=torch.long,
         )
 
@@ -255,18 +304,25 @@ class EplbState:
         expert_load_pass = torch.zeros(
             (model.num_moe_layers, model.num_physical_experts),
             dtype=torch.int32,
-            device=device,
+            device=self.device,
         )
-        expert_load_window_size = parallel_config.eplb_config.window_size
+        self.expert_load_window_size = self.parallel_config.eplb_config.window_size
         expert_load_window = torch.zeros(
-            (expert_load_window_size, model.num_moe_layers, model.num_physical_experts),
+            (
+                self.expert_load_window_size,
+                model.num_moe_layers,
+                model.num_physical_experts,
+            ),
             dtype=torch.int32,
-            device=device,
+            device=self.device,
         )
 
         # Set the initial progress of rearrangement to 3/4
-        eplb_step_interval = parallel_config.eplb_config.step_interval
-        expert_rearrangement_step = max(0, eplb_step_interval - eplb_step_interval // 4)
+        eplb_step_interval = self.parallel_config.eplb_config.step_interval
+        self.expert_rearrangement_step = max(
+            0, eplb_step_interval - eplb_step_interval // 4
+        )
+        self.expert_rearrangement_step_interval = eplb_step_interval
 
         if global_expert_load is not None:
             ep_group = get_ep_group().device_group
@@ -309,7 +365,7 @@ class EplbState:
                 (0, logical_to_physical_map.shape[-1] - max_physical_slots),
                 value=-1,
             )
-            physical_to_logical_map = new_physical_to_logical_map.to(device)
+            physical_to_logical_map = new_physical_to_logical_map.to(self.device)
             logical_to_physical_map.copy_(new_logical_to_physical_map)
             logical_replica_count.copy_(new_logical_replica_count)
 
@@ -327,22 +383,20 @@ class EplbState:
                 False,
                 rank_mapping,
             )
-            expert_rearrangement_step = 0
+            self.expert_rearrangement_step = 0
 
-        return cls(
+        self.model_states[model_config.compute_hash()] = EplbModelState(
             physical_to_logical_map,
             logical_to_physical_map,
             logical_replica_count,
             expert_load_pass,
             expert_load_window,
-            expert_load_window_size=expert_load_window_size,
-            expert_rearrangement_step=expert_rearrangement_step,
-            expert_rearrangement_step_interval=eplb_step_interval,
+            model_config.model,
+            model,
         )
 
     def step(
         self,
-        model: MixtureOfExperts,
         is_dummy: bool = False,
         is_profile: bool = False,
         log_stats: bool = False,
@@ -351,7 +405,6 @@ class EplbState:
         Step the EPLB state.
 
         Args:
-            model (MixtureOfExperts): The MoE model.
             is_dummy (bool): If `True`, this is a dummy step and the load
                 metrics recorded in this forward pass will not count.
                 Defaults to `False`.
@@ -369,60 +422,66 @@ class EplbState:
         """
 
         if is_profile:
-            self.rearrange(model, is_profile=True)
+            self.rearrange(is_profile=True)
             return
 
         if is_dummy:
             # Do not record load metrics for dummy steps
-            self.expert_load_pass.zero_()
+            for eplb_model_state in self.model_states.values():
+                eplb_model_state.expert_load_pass.zero_()
 
         if log_stats:
-            # total_expert_load_pass: (num_moe_layers, num_physical_experts)
-            total_expert_load_pass = self.expert_load_pass.clone()
-
-            # Collect load metrics from all ranks
+            # Sync the expert load pass for each model (main and drafter).
+            # expert_load_pass: (num_moe_layers, num_physical_experts)
+            expert_load_pass_list = self._sync_load_pass()
             ep_group = get_ep_group().device_group
-            all_reduce(total_expert_load_pass, group=ep_group)
-
-            # num_tokens_per_rank: (num_moe_layers, num_ranks)
-            num_tokens_per_rank = (
-                total_expert_load_pass.reshape(
-                    total_expert_load_pass.shape[0], ep_group.size(), -1
+            for expert_load_pass, eplb_model_state in zip(
+                expert_load_pass_list, self.model_states.values()
+            ):
+                # num_tokens_per_rank: (num_moe_layers, num_ranks)
+                num_tokens_per_rank = (
+                    expert_load_pass.reshape(
+                        expert_load_pass.shape[0], ep_group.size(), -1
+                    )
+                    .sum(dim=-1)
+                    .float()
                 )
-                .sum(dim=-1)
-                .float()
-            )
 
-            # Compute balancedness ratio:
-            # for each layer:
-            #   (mean load across ranks) / (max load across ranks)
-            avg_tokens_tensor = num_tokens_per_rank.mean(dim=0).sum(dim=0)
-            max_tokens_tensor = num_tokens_per_rank.max(dim=0).values.sum(dim=0)
+                # Compute balancedness ratio:
+                # for each layer:
+                #   (mean load across ranks) / (max load across ranks)
+                avg_tokens_tensor = num_tokens_per_rank.mean(dim=0).sum(dim=0)
+                max_tokens_tensor = num_tokens_per_rank.max(dim=0).values.sum(dim=0)
 
-            # Just to make type checker happy
-            tokens_tensors: list[float] = torch.stack(
-                [avg_tokens_tensor, max_tokens_tensor]
-            ).tolist()
-            avg_tokens, max_tokens = tokens_tensors
-            balancedness = avg_tokens / max_tokens if max_tokens > 0 else 0.0
+                # Just to make type checker happy
+                tokens_tensors: list[float] = torch.stack(
+                    [avg_tokens_tensor, max_tokens_tensor]
+                ).tolist()
+                avg_tokens, max_tokens = tokens_tensors
+                balancedness = avg_tokens / max_tokens if max_tokens > 0 else 0.0
 
-            if ep_group.rank() == 0:
-                logger.info(
-                    "EPLB step: avg_tokens=%.2f, max_tokens=%d, balancedness=%.4f",
-                    avg_tokens,
-                    max_tokens,
-                    balancedness,
-                )
+                if ep_group.rank() == 0:
+                    logger.info(
+                        "EPLB step: %d for model %s: avg_tokens=%.2f, "
+                        "max_tokens=%d, balancedness=%.4f",
+                        self.expert_rearrangement_step,
+                        eplb_model_state.model_name,
+                        avg_tokens,
+                        max_tokens,
+                        balancedness,
+                    )
 
         # Update the expert load sliding window
         if not is_dummy:
-            self.expert_load_window[self.expert_load_window_step] = (
-                self.expert_load_pass.clone()
-            )
+            for eplb_model_state in self.model_states.values():
+                eplb_model_state.expert_load_window[self.expert_load_window_step] = (
+                    eplb_model_state.expert_load_pass.clone()
+                )
+                eplb_model_state.expert_load_pass.zero_()
+
             self.expert_load_window_step += 1
             if self.expert_load_window_step >= self.expert_load_window_size:
                 self.expert_load_window_step = 0
-            self.expert_load_pass.zero_()
 
         # Step the expert rearrangement step
         # Note that even if this is a dummy step, we still increment the
@@ -431,18 +490,30 @@ class EplbState:
         self.expert_rearrangement_step += 1
         if self.expert_rearrangement_step >= self.expert_rearrangement_step_interval:
             self.expert_rearrangement_step = 0
-            self.rearrange(model)
+            self.rearrange()
 
     def rearrange(
         self,
-        model: MixtureOfExperts,
         is_profile: bool = False,
         execute_shuffle: bool = True,
-        global_expert_load: torch.Tensor | None = None,
+        global_expert_loads: list[torch.Tensor] | None = None,
         rank_mapping: dict[int, int] | None = None,
     ) -> torch.Tensor | None:
         """
         Rearrange the experts according to the current load.
+
+        Args:
+            is_profile (bool): If `True`, perform a dummy rearrangement.
+                This is used in `profile_run` to reserve enough memory,
+                no memory movement will be performed. Default is False.
+            execute_shuffle (bool): If `True`, execute the shuffle
+                in elastic expert parallel (EEP). Default is True.
+            global_expert_loads (list[torch.Tensor] | None): The global expert
+                loads when scaling is done in EEP.
+                List of expert loads for the main and drafter
+                (when spec decode is used) models.
+            rank_mapping (dict[int, int] | None): The rank mapping
+                when scaling is done in EEP.
         """
 
         ep_group = get_ep_group().device_group
@@ -455,53 +526,71 @@ class EplbState:
             time_start = time.perf_counter()
             logger.info("Rearranging experts %s...", "(profile)" if is_profile else "")
 
-        if global_expert_load is None:
+        if global_expert_loads is None:
             # Map the physical expert load to global logical experts
-            logical_expert_load_window = torch.zeros(
-                self.expert_load_window_size,
-                model.num_moe_layers,
-                model.num_logical_experts,
-                dtype=self.expert_load_window.dtype,
-                device=self.expert_load_window.device,
-            )
-            logical_expert_load_window.scatter_add_(
-                dim=-1,
-                index=self.physical_to_logical_map.unsqueeze(0)
-                .expand_as(self.expert_load_window)
-                .long(),
-                src=self.expert_load_window,
-            )
-
+            global_expert_load_windows = []
             if not execute_shuffle:
-                metadata = torch.tensor(
-                    [
-                        model.num_moe_layers,
-                        model.num_logical_experts,
-                        self.physical_to_logical_map.shape[1],
-                    ],
-                    dtype=torch.int32,
-                    device="cpu",
+                num_models = torch.tensor(
+                    [len(self.model_states)], dtype=torch.int32, device="cpu"
                 )
                 torch.distributed.broadcast(
-                    metadata, group=get_ep_group().cpu_group, group_src=0
+                    num_models, group=get_ep_group().cpu_group, group_src=0
                 )
 
-            # Perform all-reduce to get the expert load across all ranks
-            global_expert_load_window = logical_expert_load_window.sum(dim=0)
-            all_reduce(global_expert_load_window, group=ep_group)
+            for eplb_model_state in self.model_states.values():
+                logical_expert_load_window = torch.zeros(
+                    self.expert_load_window_size,
+                    eplb_model_state.model.num_moe_layers,
+                    eplb_model_state.model.num_logical_experts,
+                    dtype=eplb_model_state.expert_load_window.dtype,
+                    device=eplb_model_state.expert_load_window.device,
+                )
+                logical_expert_load_window.scatter_add_(
+                    dim=-1,
+                    index=eplb_model_state.physical_to_logical_map.unsqueeze(0)
+                    .expand_as(eplb_model_state.expert_load_window)
+                    .long(),
+                    src=eplb_model_state.expert_load_window,
+                )
 
+                if not execute_shuffle:
+                    metadata = torch.tensor(
+                        [
+                            eplb_model_state.model.num_moe_layers,
+                            eplb_model_state.model.num_logical_experts,
+                            eplb_model_state.physical_to_logical_map.shape[1],
+                        ],
+                        dtype=torch.int32,
+                        device="cpu",
+                    )
+                    torch.distributed.broadcast(
+                        metadata, group=get_ep_group().cpu_group, group_src=0
+                    )
+
+                global_expert_load_window = logical_expert_load_window.sum(dim=0)
+                global_expert_load_windows.append(global_expert_load_window)
+            # Perform all-reduce to get the expert load across all ranks for each model
+            global_expert_load_windows = self._allreduce_list(
+                global_expert_load_windows
+            )
             if not execute_shuffle:
-                # (num_moe_layers, old_num_physical_experts)
-                old_global_expert_indices = self.physical_to_logical_map
-                torch.distributed.broadcast(
-                    old_global_expert_indices, group=ep_group, group_src=0
-                )
-                return global_expert_load_window
+                for eplb_model_state, global_expert_load_window in zip(
+                    self.model_states.values(), global_expert_load_windows
+                ):
+                    # (num_moe_layers, old_num_physical_experts)
+                    old_global_expert_indices = eplb_model_state.physical_to_logical_map
+                    torch.distributed.broadcast(
+                        old_global_expert_indices, group=ep_group, group_src=0
+                    )
+            if not execute_shuffle:
+                return global_expert_load_windows
         else:
             assert execute_shuffle
-            global_expert_load_window = global_expert_load
+            global_expert_load_windows = global_expert_loads
 
         # TODO(bowen): Treat differently for prefill and decode nodes
+        eplb_model_state = next(iter(self.model_states.values()))
+        model = eplb_model_state.model
         num_replicas = model.num_physical_experts
         num_groups = model.num_expert_groups
         if rank_mapping is not None and len(rank_mapping) == ep_group.size():
@@ -526,48 +615,64 @@ class EplbState:
                 f"{num_gpus=}, {num_nodes=}"
             )
 
-        # Get new expert mappings
-        (
-            new_physical_to_logical_map,
-            new_logical_to_physical_map,
-            new_logical_replica_count,
-        ) = rebalance_experts(
-            global_expert_load_window,
-            num_replicas,
-            num_groups,
-            num_nodes,
-            num_gpus,
-        )
-
-        # Update expert weights
-        rearrange_expert_weights_inplace(
-            self.physical_to_logical_map,
-            new_physical_to_logical_map,
-            model.expert_weights,
-            ep_group,
-            is_profile,
-            rank_mapping,
-        )
-
-        if not is_profile:
-            if (
-                self.physical_to_logical_map.shape[1]
-                != new_physical_to_logical_map.shape[1]
-            ):
-                self.physical_to_logical_map = new_physical_to_logical_map.to(
-                    self.physical_to_logical_map.device
-                )
-            else:
-                self.physical_to_logical_map.copy_(new_physical_to_logical_map)
-            max_physical_slots = new_logical_to_physical_map.shape[-1]
-            assert max_physical_slots <= self.logical_to_physical_map.shape[-1]
-            new_logical_to_physical_map = torch.nn.functional.pad(
+        for eplb_model_state, global_expert_load_window in zip(
+            self.model_states.values(), global_expert_load_windows
+        ):
+            # Get new expert mappings for the model
+            (
+                new_physical_to_logical_map,
                 new_logical_to_physical_map,
-                (0, self.logical_to_physical_map.shape[-1] - max_physical_slots),
-                value=-1,
+                new_logical_replica_count,
+            ) = rebalance_experts(
+                global_expert_load_window,
+                num_replicas,
+                num_groups,
+                num_nodes,
+                num_gpus,
             )
-            self.logical_to_physical_map.copy_(new_logical_to_physical_map)
-            self.logical_replica_count.copy_(new_logical_replica_count)
+
+            # Update expert weights
+            rearrange_expert_weights_inplace(
+                eplb_model_state.physical_to_logical_map,
+                new_physical_to_logical_map,
+                eplb_model_state.model.expert_weights,
+                ep_group,
+                is_profile,
+                rank_mapping,
+            )
+
+            if not is_profile:
+                if (
+                    eplb_model_state.physical_to_logical_map.shape[1]
+                    != new_physical_to_logical_map.shape[1]
+                ):
+                    eplb_model_state.physical_to_logical_map = (
+                        new_physical_to_logical_map.to(
+                            eplb_model_state.physical_to_logical_map.device
+                        )
+                    )
+                else:
+                    eplb_model_state.physical_to_logical_map.copy_(
+                        new_physical_to_logical_map
+                    )
+                max_physical_slots = new_logical_to_physical_map.shape[-1]
+                assert (
+                    max_physical_slots
+                    <= eplb_model_state.logical_to_physical_map.shape[-1]
+                )
+                new_logical_to_physical_map = torch.nn.functional.pad(
+                    new_logical_to_physical_map,
+                    (
+                        0,
+                        eplb_model_state.logical_to_physical_map.shape[-1]
+                        - max_physical_slots,
+                    ),
+                    value=-1,
+                )
+                eplb_model_state.logical_to_physical_map.copy_(
+                    new_logical_to_physical_map
+                )
+                eplb_model_state.logical_replica_count.copy_(new_logical_replica_count)
 
         if is_main_rank:
             assert time_start is not None
@@ -581,32 +686,118 @@ class EplbState:
         return None
 
     @staticmethod
-    def recv_state() -> tuple[torch.Tensor, torch.Tensor]:
+    def recv_state() -> tuple[list[torch.Tensor], list[torch.Tensor]]:
         """
         Receive the expert load and old placement from the master rank.
         """
         ep_group = get_ep_group()
-        metadata = torch.empty(3, dtype=torch.int32, device="cpu")
-        torch.distributed.broadcast(metadata, group=ep_group.cpu_group, group_src=0)
-        num_moe_layers, num_logical_experts, num_old_physical_experts = (
-            metadata.tolist()
-        )
-        global_expert_load = torch.zeros(
-            (num_moe_layers, num_logical_experts),
-            dtype=torch.int64,
-            device=ep_group.device,
-        )
-        all_reduce(global_expert_load, group=ep_group.device_group)
-        old_global_expert_indices = torch.empty(
-            (num_moe_layers, num_old_physical_experts),
-            dtype=torch.int64,
-            device=ep_group.device,
-        )
+        num_models = torch.empty(1, dtype=torch.int32, device="cpu")
+        torch.distributed.broadcast(num_models, group=ep_group.cpu_group, group_src=0)
+        num_models = num_models.item()
+        global_expert_loads = []
+        old_global_expert_indices_per_model = []
+        for _ in range(num_models):
+            metadata = torch.empty(3, dtype=torch.int32, device="cpu")
+            torch.distributed.broadcast(metadata, group=ep_group.cpu_group, group_src=0)
+            num_moe_layers, num_logical_experts, num_old_physical_experts = (
+                metadata.tolist()
+            )
+            global_expert_load = torch.zeros(
+                (num_moe_layers, num_logical_experts),
+                dtype=torch.int64,
+                device=ep_group.device,
+            )
+            all_reduce(global_expert_load, group=ep_group.device_group)
+            old_global_expert_indices = torch.empty(
+                (num_moe_layers, num_old_physical_experts),
+                dtype=torch.int64,
+                device=ep_group.device,
+            )
+            torch.distributed.broadcast(
+                old_global_expert_indices,
+                group=ep_group.device_group,
+                group_src=0,
+            )
+            global_expert_loads.append(global_expert_load)
+            old_global_expert_indices_per_model.append(old_global_expert_indices)
+        return global_expert_loads, old_global_expert_indices_per_model
+
+    @classmethod
+    def get_eep_state(
+        cls, parallel_config: ParallelConfig
+    ) -> tuple[
+        list[torch.Tensor] | None,
+        list[torch.Tensor] | None,
+        dict[int, int] | None,
+    ]:
+        num_local_physical_experts = torch.empty(1, dtype=torch.int32, device="cpu")
         torch.distributed.broadcast(
-            old_global_expert_indices, group=ep_group.device_group, group_src=0
+            num_local_physical_experts,
+            group=get_ep_group().cpu_group,
+            group_src=0,
+        )
+        num_local_physical_experts = int(num_local_physical_experts.item())
+        new_ep_size = get_ep_group().world_size
+        global_expert_loads, old_global_expert_indices_per_model = (
+            EplbState.recv_state()
         )
 
-        return global_expert_load, old_global_expert_indices
+        # EP configuration for all models has to be the same so as eplb config
+        num_logical_experts = global_expert_loads[0].shape[1]
+        parallel_config.eplb_config.num_redundant_experts = (
+            num_local_physical_experts * new_ep_size - num_logical_experts
+        )
+        assert (
+            old_global_expert_indices_per_model[0].shape[1] % num_local_physical_experts
+            == 0
+        )
+        old_ep_size = (
+            old_global_expert_indices_per_model[0].shape[1]
+            // num_local_physical_experts
+        )
+        rank_mapping = {old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size)}
+        return (
+            global_expert_loads,
+            old_global_expert_indices_per_model,
+            rank_mapping,
+        )
+
+    def _allreduce_list(self, tensor_list: list[torch.Tensor]) -> list[torch.Tensor]:
+        """
+        All-reduce a list of tensors.
+        """
+        if len(tensor_list) == 1:
+            all_reduce(tensor_list[0], group=get_ep_group().device_group)
+            return tensor_list
+        assert all(t.dim() == 2 for t in tensor_list), "All tensors must be 2D."
+        assert all(t.shape[1] == tensor_list[0].shape[1] for t in tensor_list), (
+            "All tensors must have the same shape[1]."
+        )
+        # Concatenate, all_reduce, then unpack to original shapes.
+        # We assume all tensors are 2D and shape[1] (num_physical_experts)
+        # is the same across all models.
+        shapes = [t.shape for t in tensor_list]
+        concat_tensor = torch.cat(tensor_list, dim=0)
+
+        ep_group = get_ep_group().device_group
+        all_reduce(concat_tensor, group=ep_group)
+
+        all_reduce_list = []
+        offset = 0
+        for shape in shapes:
+            all_reduce_list.append(concat_tensor[offset : offset + shape[0], :])
+            offset += shape[0]
+        return all_reduce_list
+
+    def _sync_load_pass(self) -> list[torch.Tensor]:
+        """
+        Sync the expert load pass across all ranks for log stats.
+        Doesn't update the expert load pass in eplb_model_state.
+        """
+        load_pass_list = []
+        for eplb_model_state in self.model_states.values():
+            load_pass_list.append(eplb_model_state.expert_load_pass.clone())
+        return self._allreduce_list(load_pass_list)
 
 
 def _node_count_with_rank_mapping(
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index c64996f13cd5d..494a4d3c33aa4 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -3,10 +3,8 @@
 
 import importlib
 from collections.abc import Callable
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Optional, cast
 
-import vllm.envs as envs
-from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.base import (
     KVConnectorBase,
     KVConnectorBaseType,
@@ -16,9 +14,12 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import (
     supports_hma,
 )
 from vllm.logger import init_logger
+from vllm.utils.func_utils import supports_kw
 
 if TYPE_CHECKING:
+    from vllm.config import VllmConfig
     from vllm.config.kv_transfer import KVTransferConfig
+    from vllm.v1.kv_cache_interface import KVCacheConfig
 
 logger = init_logger(__name__)
 
@@ -41,19 +42,16 @@ class KVConnectorFactory:
     @classmethod
     def create_connector(
         cls,
-        config: VllmConfig,
+        config: "VllmConfig",
         role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
     ) -> KVConnectorBase:
-        if not envs.VLLM_USE_V1:
-            raise ValueError(
-                "Attempting to initialize a V1 Connector, "
-                f"but found {envs.VLLM_USE_V1=}"
-            )
-
         kv_transfer_config = config.kv_transfer_config
         if kv_transfer_config is None:
             raise ValueError("kv_transfer_config must be set to create a connector")
-        connector_cls = cls.get_connector_class(kv_transfer_config)
+        connector_cls, compat_sig = cls._get_connector_class_with_compat(
+            kv_transfer_config
+        )
 
         # check if the connector supports HMA
         hma_enabled = not config.scheduler_config.disable_hybrid_kv_cache_manager
@@ -76,7 +74,12 @@ class KVConnectorFactory:
         # - Co-locate with worker process
         # - Should only be used inside the forward context & attention layer
         # We build separately to enforce strict separation
-        return connector_cls(config, role)
+        if compat_sig:
+            # Old signature: __init__(self, vllm_config, role)
+            return connector_cls(config, role)
+        else:
+            # New signature: __init__(self, vllm_config, role, kv_cache_config)
+            return connector_cls(config, role, kv_cache_config)
 
     @classmethod
     def get_connector_class_by_name(
@@ -97,13 +100,13 @@ class KVConnectorFactory:
         return cls._registry[connector_name]()
 
     @classmethod
-    def get_connector_class(
+    def _get_connector_class_with_compat(
         cls, kv_transfer_config: "KVTransferConfig"
-    ) -> type[KVConnectorBaseType]:
-        """Get the connector class by name."""
+    ) -> tuple[type[KVConnectorBaseType], bool]:
         connector_name = kv_transfer_config.kv_connector
         if connector_name is None:
             raise ValueError("Connector name is not set in KVTransferConfig")
+        compat_sig = False
         if connector_name in cls._registry:
             connector_cls = cls._registry[connector_name]()
         else:
@@ -118,6 +121,21 @@ class KVConnectorFactory:
                     f"Class {connector_name} not found in {connector_module_path}"
                 ) from e
             connector_cls = cast(type[KVConnectorBaseType], connector_cls)
+            if not supports_kw(connector_cls, "kv_cache_config"):
+                compat_sig = True
+                logger.warning(
+                    "Connector %s uses deprecated signature with 2 required arguments. "
+                    "Please update to include kv_cache_config as the second argument.",
+                    connector_cls.__name__,
+                )
+        return connector_cls, compat_sig
+
+    @classmethod
+    def get_connector_class(
+        cls, kv_transfer_config: "KVTransferConfig"
+    ) -> type[KVConnectorBaseType]:
+        """Get the connector class by name."""
+        connector_cls, _ = cls._get_connector_class_with_compat(kv_transfer_config)
         return connector_cls
 
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index 7464f8469c3b5..b8eb5ea3b4939 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -4,8 +4,6 @@
 KV cache helper for store.
 """
 
-from collections.abc import Sequence
-from concurrent.futures import CancelledError, Future
 from typing import TYPE_CHECKING, Literal
 
 import torch
@@ -219,43 +217,6 @@ class KVOutputAggregator:
 
         return output
 
-    def async_aggregate(
-        self,
-        output_futures: Sequence[Future[ModelRunnerOutput | None]],
-        output_rank: int = 0,
-    ) -> Future[ModelRunnerOutput | None]:
-        """Takes a list of futures and returns a single future which resolves
-        to the respective list of outputs."""
-        result_future: Future[ModelRunnerOutput | None] = Future()
-
-        outputs: list[ModelRunnerOutput | None] = [None] * len(output_futures)
-        remaining = len(output_futures)
-
-        def make_callback(idx):
-            def callback(fut):
-                if result_future.done():
-                    return
-
-                try:
-                    outputs[idx] = fut.result()
-                except CancelledError:
-                    result_future.cancel()
-                except Exception as e:
-                    result_future.set_exception(e)
-
-                # this check assumes io_thread_pool uses a single thread
-                nonlocal remaining
-                remaining -= 1
-                if not remaining:
-                    result_future.set_result(self.aggregate(outputs, output_rank))
-
-            return callback
-
-        for i, output_future in enumerate(output_futures):
-            output_future.add_done_callback(make_callback(i))
-
-        return result_future
-
 
 def _make_src_and_dst_indices(
     src_block_ids: list[int],
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index cb9f208a839f2..354aa9a87183d 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -58,6 +58,7 @@ if TYPE_CHECKING:
     )
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 # s_tensor_list, d_tensor_list, s_indices, d_indices, direction
@@ -141,7 +142,12 @@ class KVConnectorMetadata(ABC):  # noqa: B024
 
 
 class KVConnectorBase_V1(ABC):
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
         logger.warning(
             "Initializing KVConnectorBase_V1. This API is experimental and "
             "subject to change in the future as we iterate the design."
@@ -152,6 +158,14 @@ class KVConnectorBase_V1(ABC):
             self._kv_transfer_config = vllm_config.kv_transfer_config
         else:
             raise ValueError("kv_transfer_config must be set for KVConnectorBase_V1")
+        self._kv_cache_config = kv_cache_config
+        if self._kv_cache_config is None:
+            logger.warning(
+                "KVConnectorBase_V1 initialized without kv_cache_config. "
+                "This is deprecated - please update your connector to accept "
+                "kv_cache_config as the third constructor argument and pass it "
+                "to super().__init__()."
+            )
         self._role = role
 
     @property
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
index ca251cd0c6ebd..9cd7d93c92fa3 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
@@ -32,7 +32,7 @@ Usage:
 """
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 
@@ -50,6 +50,7 @@ if TYPE_CHECKING:
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
     from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -79,8 +80,13 @@ class DecodeBenchConnector(KVConnectorBase_V1):
     testing of the decoder with larger input sequence lengths.
     """
 
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config, role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(vllm_config, role, kv_cache_config)
 
         self.connector_scheduler: DecodeBenchConnectorScheduler | None = None
         self.connector_worker: DecodeBenchConnectorWorker | None = None
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
index 7232d947030cb..575ab468be566 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -20,14 +20,22 @@ if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
 
 
 class LMCacheConnectorV1(KVConnectorBase_V1):
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: "KVCacheConfig",
+    ):
+        super().__init__(
+            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
+        )
         assert vllm_config.kv_transfer_config is not None
         use_native = vllm_config.kv_transfer_config.get_from_extra_config(
             "use_native", False
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
index ad907c75a244b..94572b02fa872 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
@@ -724,7 +724,7 @@ class LMCacheConnectorV1Impl:
                 "max_model_len": getattr(
                     vllm_config.model_config, "max_model_len", None
                 ),
-                "vocab_size": getattr(vllm_config.model_config, "vocab_size", None),
+                "vocab_size": vllm_config.model_config.get_vocab_size(),
                 "num_layers": getattr(
                     vllm_config.model_config, "get_num_layers", lambda _: None
                 )(vllm_config.parallel_config),
@@ -746,10 +746,6 @@ class LMCacheConnectorV1Impl:
                 "gpu_memory_utilization": getattr(
                     vllm_config.cache_config, "gpu_memory_utilization", None
                 ),
-                "swap_space": getattr(vllm_config.cache_config, "swap_space", None),
-                "enable_prefix_caching": getattr(
-                    vllm_config.cache_config, "enable_prefix_caching", None
-                ),
             },
         }
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index d56f30bd11e5b..d7bbf02c83677 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
     from vllm.distributed.kv_events import KVCacheEvent
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -109,15 +110,22 @@ class MultiConnector(KVConnectorBase_V1):
     - Save to all connectors.
     """
 
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: "KVCacheConfig",
+    ):
+        super().__init__(
+            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
+        )
 
         self._connectors: list[KVConnectorBase_V1] = []
         self._ktc_kv_transfer_config = []
         for connector_cls, temp_config in self._get_connector_classes_and_configs(
             vllm_config
         ):
-            self._connectors.append(connector_cls(temp_config, role))
+            self._connectors.append(connector_cls(temp_config, role, kv_cache_config))
             self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config)
 
         # A mapping from request id to the index of the connector chosen to
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 4651cedbc7dfa..ff9770b72bd38 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -13,7 +13,7 @@ from collections import defaultdict
 from collections.abc import Iterator
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional
 
 import msgspec
 import numpy as np
@@ -52,6 +52,7 @@ from vllm.v1.core.sched.output import SchedulerOutput
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 Transfer = tuple[int, float]  # (xfer_handle, start_time)
@@ -150,7 +151,14 @@ class NixlConnectorMetadata(KVConnectorMetadata):
 
 
 class NixlConnector(KVConnectorBase_V1):
-    def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(vllm_config, role, kv_cache_config)
+
         assert vllm_config.kv_transfer_config is not None
         assert vllm_config.kv_transfer_config.engine_id is not None
         self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
index 19344e5784c23..582e42cc466ae 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -21,6 +21,7 @@ from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import BlockHash
 from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.kv_offload.abstract import OffloadingManager
 from vllm.v1.kv_offload.factory import OffloadingSpecFactory
 from vllm.v1.kv_offload.mediums import GPULoadStoreSpec
@@ -41,8 +42,13 @@ class OffloadingConnectorMetadata(KVConnectorMetadata):
 
 
 class OffloadingConnector(KVConnectorBase_V1):
-    def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole):
-        super().__init__(vllm_config, role)
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        role: KVConnectorRole,
+        kv_cache_config: KVCacheConfig | None = None,
+    ):
+        super().__init__(vllm_config, role, kv_cache_config)
 
         spec = OffloadingSpecFactory.create_spec(vllm_config)
 
@@ -274,8 +280,8 @@ class OffloadingConnectorScheduler:
             if num_new_blocks <= 0:
                 continue
 
-            num_gpu_blocks = num_blocks * self.block_size_factor
-            assert len(req.block_hashes) >= num_gpu_blocks
+            # NOTE: In async scheduling, placeholders may temporarily make
+            # len(req.block_hashes) < num_blocks * self.block_size_factor.
 
             new_block_hashes = self._get_block_hashes(
                 req, start_idx=start_block_idx, end_idx=num_blocks
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
index 780dd12fccda3..a124a0d519db8 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional
 
 import regex as re
 import torch
@@ -25,6 +25,7 @@ if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -71,8 +72,17 @@ class P2pNcclConnectorMetadata(KVConnectorMetadata):
 
 
 class P2pNcclConnector(KVConnectorBase_V1):
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(
+            vllm_config=vllm_config,
+            role=role,
+            kv_cache_config=kv_cache_config,
+        )
         self._block_size = vllm_config.cache_config.block_size
         self._requests_need_load: dict[str, Any] = {}
         self.is_producer = self._kv_transfer_config.is_kv_producer
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
index 9c230d7d0d2f4..016d1d45b3593 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@@ -3,7 +3,7 @@
 import hashlib
 import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional
 
 import safetensors
 import torch
@@ -22,6 +22,7 @@ if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -86,8 +87,17 @@ class SharedStorageConnector(KVConnectorBase_V1):
     # It does extra work which will overwrite the existing prefix-cache in GPU
     # - to remove the overhead, need to add some "mask" in the ReqMeta class
 
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(
+            vllm_config=vllm_config,
+            role=role,
+            kv_cache_config=kv_cache_config,
+        )
         self._block_size = vllm_config.cache_config.block_size
         self._requests_need_load: dict[str, Request] = {}
         self._storage_path = self._kv_transfer_config.get_from_extra_config(
diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py
index cabfc10e7f942..54b46d98870a5 100644
--- a/vllm/distributed/kv_transfer/kv_transfer_state.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_state.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
-from vllm import envs
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.distributed.kv_transfer.kv_connector.v1 import (
@@ -12,6 +11,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import (
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
+    from vllm.v1.kv_cache_interface import KVCacheConfig
 
 _KV_CONNECTOR_AGENT: KVConnectorBaseType | None = None
 
@@ -48,7 +48,9 @@ def is_v1_kv_transfer_group(connector: KVConnectorBaseType | None = None) -> boo
     return isinstance(connector, KVConnectorBase_V1)
 
 
-def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
+def ensure_kv_transfer_initialized(
+    vllm_config: "VllmConfig", kv_cache_config: Optional["KVCacheConfig"] = None
+) -> None:
     """
     Initialize KV cache transfer parallel group.
     """
@@ -62,12 +64,11 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
         vllm_config.kv_transfer_config.is_kv_transfer_instance
         and _KV_CONNECTOR_AGENT is None
     ):
-        if envs.VLLM_USE_V1:
-            _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
-                config=vllm_config, role=KVConnectorRole.WORKER
-            )
-        else:
-            raise ValueError("V0 is no longer supported")
+        _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
+            config=vllm_config,
+            role=KVConnectorRole.WORKER,
+            kv_cache_config=kv_cache_config,
+        )
 
 
 def ensure_kv_transfer_shutdown() -> None:
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index a9b01e82562b9..c78e6a32733c1 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1483,6 +1483,9 @@ def destroy_distributed_environment():
 
 
 def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
+    # Ensure all objects are not freezed before cleanup
+    gc.unfreeze()
+
     destroy_model_parallel()
     destroy_distributed_environment()
     if shutdown_ray:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 66c75d944ec8b..b12b7082af627 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -80,13 +80,12 @@ from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
 from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized
-from vllm.reasoning import ReasoningParserManager
 from vllm.transformers_utils.config import (
     get_model_path,
     is_interleaved,
     maybe_override_with_speculators,
 )
-from vllm.transformers_utils.utils import check_gguf_file, is_s3
+from vllm.transformers_utils.utils import check_gguf_file, is_cloud_storage
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.network_utils import get_ip
@@ -386,6 +385,7 @@ class EngineArgs:
     pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
     tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
     decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size
+    dcp_kv_cache_interleave_size: int = ParallelConfig.dcp_kv_cache_interleave_size
     data_parallel_size: int = ParallelConfig.data_parallel_size
     data_parallel_rank: int | None = None
     data_parallel_start_rank: int | None = None
@@ -438,8 +438,6 @@ class EngineArgs:
     aggregate_engine_logging: bool = False
     revision: str | None = ModelConfig.revision
     code_revision: str | None = ModelConfig.code_revision
-    rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling")
-    rope_theta: float | None = ModelConfig.rope_theta
     hf_token: bool | str | None = ModelConfig.hf_token
     hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
     tokenizer_revision: str | None = ModelConfig.tokenizer_revision
@@ -497,7 +495,7 @@ class EngineArgs:
         VllmConfig, "structured_outputs_config"
     )
     reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
-
+    reasoning_parser_plugin: str | None = None
     # Deprecated guided decoding fields
     guided_decoding_backend: str | None = None
     guided_decoding_disable_fallback: bool | None = None
@@ -516,7 +514,7 @@ class EngineArgs:
         ObservabilityConfig.collect_detailed_traces
     )
     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
-    scheduler_cls: str | type[object] = SchedulerConfig.scheduler_cls
+    scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
 
     pooler_config: PoolerConfig | None = ModelConfig.pooler_config
     override_pooler_config: dict | PoolerConfig | None = (
@@ -555,7 +553,7 @@ class EngineArgs:
     )
     """Custom logitproc types"""
 
-    async_scheduling: bool = SchedulerConfig.async_scheduling
+    async_scheduling: bool | None = SchedulerConfig.async_scheduling
 
     kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill
 
@@ -617,8 +615,6 @@ class EngineArgs:
         )
         model_group.add_argument("--revision", **model_kwargs["revision"])
         model_group.add_argument("--code-revision", **model_kwargs["code_revision"])
-        model_group.add_argument("--rope-scaling", **model_kwargs["rope_scaling"])
-        model_group.add_argument("--rope-theta", **model_kwargs["rope_theta"])
         model_group.add_argument(
             "--tokenizer-revision", **model_kwargs["tokenizer_revision"]
         )
@@ -711,10 +707,13 @@ class EngineArgs:
         )
         structured_outputs_group.add_argument(
             "--reasoning-parser",
-            # This choice is a special case because it's not static
-            choices=list(ReasoningParserManager.reasoning_parsers),
+            # Choices need to be validated after parsing to include plugins
             **structured_outputs_kwargs["reasoning_parser"],
         )
+        structured_outputs_group.add_argument(
+            "--reasoning-parser-plugin",
+            **structured_outputs_kwargs["reasoning_parser_plugin"],
+        )
         # Deprecated guided decoding arguments
         for arg, type in [
             ("--guided-decoding-backend", str),
@@ -752,6 +751,10 @@ class EngineArgs:
             "-dcp",
             **parallel_kwargs["decode_context_parallel_size"],
         )
+        parallel_group.add_argument(
+            "--dcp-kv-cache-interleave-size",
+            **parallel_kwargs["dcp_kv_cache_interleave_size"],
+        )
         parallel_group.add_argument(
             "--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"]
         )
@@ -1184,8 +1187,6 @@ class EngineArgs:
             seed=self.seed,
             revision=self.revision,
             code_revision=self.code_revision,
-            rope_scaling=self.rope_scaling,
-            rope_theta=self.rope_theta,
             hf_token=self.hf_token,
             hf_overrides=self.hf_overrides,
             tokenizer_revision=self.tokenizer_revision,
@@ -1294,15 +1295,7 @@ class EngineArgs:
         """
         Create the VllmConfig.
 
-        NOTE: for autoselection of V0 vs V1 engine, we need to
-        create the ModelConfig first, since ModelConfig's attrs
-        (e.g. the model arch) are needed to make the decision.
-
-        This function set VLLM_USE_V1=X if VLLM_USE_V1 is
-        unspecified by the user.
-
-        If VLLM_USE_V1 is specified by the user but the VllmConfig
-        is incompatible, we raise an error.
+        NOTE: If VllmConfig is incompatible, we raise an error.
         """
         current_platform.pre_register_and_update()
 
@@ -1310,10 +1303,10 @@ class EngineArgs:
 
         # Check if the model is a speculator and override model/tokenizer/config
         # BEFORE creating ModelConfig, so the config is created with the target model
-        # Skip speculator detection for S3 models since HuggingFace cannot load
-        # configs directly from S3 URLs. S3 models can still use speculators with
-        # explicit --speculative-config.
-        if not is_s3(self.model):
+        # Skip speculator detection for cloud storage models (eg: S3, GCS) since
+        # HuggingFace cannot load configs directly from S3 URLs. S3 models can still
+        # use speculators with explicit --speculative-config.
+        if not is_cloud_storage(self.model):
             (self.model, self.tokenizer, self.speculative_config) = (
                 maybe_override_with_speculators(
                     model=self.model,
@@ -1328,22 +1321,7 @@ class EngineArgs:
         self.model = model_config.model
         self.tokenizer = model_config.tokenizer
 
-        # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
-        #   and fall back to V0 for experimental or unsupported features.
-        # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
-        #   features and raise error for unsupported features.
-        # * If VLLM_USE_V1=0, we disable V1.
-        use_v1 = False
-        try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
-        if try_v1 and self._is_v1_supported_oracle(model_config):
-            use_v1 = True
-
-        # If user explicitly set VLLM_USE_V1, sanity check we respect it.
-        if envs.is_set("VLLM_USE_V1"):
-            assert use_v1 == envs.VLLM_USE_V1
-        # Otherwise, set the VLLM_USE_V1 variable globally.
-        else:
-            envs.set_vllm_use_v1(use_v1)
+        self._check_feature_supported(model_config)
 
         # Set default arguments for V1 Engine.
         self._set_default_args(usage_context, model_config)
@@ -1506,20 +1484,6 @@ class EngineArgs:
             else ParallelConfig.data_parallel_rpc_port
         )
 
-        if self.async_scheduling:
-            if self.pipeline_parallel_size > 1:
-                raise ValueError(
-                    "Async scheduling is not supported with pipeline-parallel-size > 1."
-                )
-
-            # Currently, async scheduling does not support speculative decoding.
-            # TODO(woosuk): Support it.
-            if self.speculative_config is not None:
-                raise ValueError(
-                    "Currently, speculative decoding is not supported with "
-                    "async scheduling."
-                )
-
         # Forward the deprecated CLI args to the EPLB config.
         if self.num_redundant_experts is not None:
             self.eplb_config.num_redundant_experts = self.num_redundant_experts
@@ -1559,20 +1523,11 @@ class EngineArgs:
             worker_cls=self.worker_cls,
             worker_extension_cls=self.worker_extension_cls,
             decode_context_parallel_size=self.decode_context_parallel_size,
+            dcp_kv_cache_interleave_size=self.dcp_kv_cache_interleave_size,
             _api_process_count=self._api_process_count,
             _api_process_rank=self._api_process_rank,
         )
 
-        if self.async_scheduling and (
-            parallel_config.distributed_executor_backend
-            not in ("mp", "uni", "external_launcher")
-        ):
-            raise ValueError(
-                "Currently, async scheduling only supports `mp`, `uni` or "
-                "`external_launcher` distributed executor backend, but you choose "
-                f"`{parallel_config.distributed_executor_backend}`."
-            )
-
         speculative_config = self.create_speculative_config(
             target_model_config=model_config,
             target_parallel_config=parallel_config,
@@ -1625,6 +1580,20 @@ class EngineArgs:
             else None
         )
 
+        if (
+            lora_config is not None
+            and speculative_config is not None
+            and scheduler_config.max_num_batched_tokens
+            < (
+                scheduler_config.max_num_seqs
+                * (speculative_config.num_speculative_tokens + 1)
+            )
+        ):
+            raise ValueError(
+                "Consider increasing max_num_batched_tokens or "
+                "decreasing num_speculative_tokens"
+            )
+
         # bitsandbytes pre-quantized model need a specific model loader
         if model_config.quantization == "bitsandbytes":
             self.quantization = self.load_format = "bitsandbytes"
@@ -1635,6 +1604,11 @@ class EngineArgs:
         if self.reasoning_parser:
             self.structured_outputs_config.reasoning_parser = self.reasoning_parser
 
+        if self.reasoning_parser_plugin:
+            self.structured_outputs_config.reasoning_parser_plugin = (
+                self.reasoning_parser_plugin
+            )
+
         # Forward the deprecated CLI args to the StructuredOutputsConfig
         so_config = self.structured_outputs_config
         if self.guided_decoding_backend is not None:
@@ -1707,17 +1681,10 @@ class EngineArgs:
 
         return config
 
-    def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
-        """Oracle for whether to use V0 or V1 Engine by default."""
-
-        #############################################################
-        # Unsupported Feature Flags on V1.
-
+    def _check_feature_supported(self, model_config: ModelConfig):
+        """Raise an error if the feature is not supported."""
         if self.logits_processor_pattern != EngineArgs.logits_processor_pattern:
-            _raise_or_fallback(
-                feature_name="--logits-processor-pattern", recommend_to_remove=False
-            )
-            return False
+            _raise_unsupported_error(feature_name="--logits-processor-pattern")
 
         # No Concurrent Partial Prefills so far.
         if (
@@ -1725,12 +1692,9 @@ class EngineArgs:
             or self.max_long_partial_prefills
             != SchedulerConfig.max_long_partial_prefills
         ):
-            _raise_or_fallback(
-                feature_name="Concurrent Partial Prefill", recommend_to_remove=False
-            )
-            return False
+            _raise_unsupported_error(feature_name="Concurrent Partial Prefill")
 
-        # V1 supports N-gram, Medusa, and Eagle speculative decoding.
+        # N-gram, Medusa, and Eagle are supported for speculative decoding.
         if self.speculative_config is not None:
             # speculative_config could still be a dict at this point
             if isinstance(self.speculative_config, dict):
@@ -1745,35 +1709,6 @@ class EngineArgs:
                     "such as ngram, medusa, eagle, or mtp."
                 )
 
-        V1_BACKENDS = [
-            "FLASH_ATTN",
-            "PALLAS",
-            "TRITON_ATTN",
-            "TRITON_MLA",
-            "CUTLASS_MLA",
-            "FLASHMLA",
-            "FLASH_ATTN_MLA",
-            "FLASHINFER",
-            "FLASHINFER_MLA",
-            "ROCM_AITER_MLA",
-            "TORCH_SDPA",
-            "FLEX_ATTENTION",
-            "TREE_ATTN",
-            "XFORMERS",
-            "ROCM_ATTN",
-            "ROCM_AITER_UNIFIED_ATTN",
-        ]
-        if (
-            envs.is_set("VLLM_ATTENTION_BACKEND")
-            and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS
-        ):
-            name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
-            _raise_or_fallback(feature_name=name, recommend_to_remove=True)
-            return False
-
-        #############################################################
-        # Experimental Features - allow users to opt in.
-
         if self.pipeline_parallel_size > 1:
             supports_pp = getattr(
                 self.distributed_executor_backend, "supports_pp", False
@@ -1789,18 +1724,10 @@ class EngineArgs:
                     "executor or multiprocessing executor or external "
                     "launcher"
                 )
-                _raise_or_fallback(feature_name=name, recommend_to_remove=False)
-                return False
+                _raise_unsupported_error(feature_name=name)
 
         if current_platform.is_cpu() and model_config.get_sliding_window() is not None:
-            _raise_or_fallback(
-                feature_name="sliding window (CPU backend)", recommend_to_remove=False
-            )
-            return False
-
-        #############################################################
-
-        return True
+            _raise_unsupported_error(feature_name="sliding window (CPU backend)")
 
     def _set_default_args(
         self, usage_context: UsageContext, model_config: ModelConfig
@@ -1999,17 +1926,12 @@ class AsyncEngineArgs(EngineArgs):
         return parser
 
 
-def _raise_or_fallback(feature_name: str, recommend_to_remove: bool):
-    if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
-        raise NotImplementedError(
-            f"VLLM_USE_V1=1 is not supported with {feature_name}."
-        )
-    msg = f"{feature_name} is not supported by the V1 Engine. "
-    msg += "Falling back to V0. "
-    if recommend_to_remove:
-        msg += f"We recommend to remove {feature_name} from your config "
-        msg += "in favor of the V1 Engine."
-    logger.warning(msg)
+def _raise_unsupported_error(feature_name: str):
+    msg = (
+        f"{feature_name} is not supported. We recommend to "
+        f"remove {feature_name} from your config."
+    )
+    raise NotImplementedError(msg)
 
 
 def human_readable_int(value):
diff --git a/vllm/entrypoints/anthropic/api_server.py b/vllm/entrypoints/anthropic/api_server.py
deleted file mode 100644
index df877f99b084f..0000000000000
--- a/vllm/entrypoints/anthropic/api_server.py
+++ /dev/null
@@ -1,301 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from:
-# https://github.com/vllm/vllm/entrypoints/openai/api_server.py
-
-import asyncio
-import signal
-import tempfile
-from argparse import Namespace
-from http import HTTPStatus
-
-import uvloop
-from fastapi import APIRouter, Depends, FastAPI, Request
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, Response, StreamingResponse
-from starlette.datastructures import State
-
-import vllm.envs as envs
-from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.anthropic.protocol import (
-    AnthropicErrorResponse,
-    AnthropicMessagesRequest,
-    AnthropicMessagesResponse,
-)
-from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages
-from vllm.entrypoints.launcher import serve_http
-from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client,
-    create_server_socket,
-    lifespan,
-    load_log_config,
-    validate_api_server_args,
-    validate_json_request,
-)
-from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
-from vllm.entrypoints.openai.protocol import ErrorResponse
-from vllm.entrypoints.openai.serving_models import (
-    BaseModelPath,
-    OpenAIServingModels,
-)
-
-#
-# yapf: enable
-from vllm.entrypoints.openai.tool_parsers import ToolParserManager
-from vllm.entrypoints.utils import (
-    cli_env_setup,
-    load_aware_call,
-    process_chat_template,
-    process_lora_modules,
-    with_cancellation,
-)
-from vllm.logger import init_logger
-from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.network_utils import is_valid_ipv6_address
-from vllm.utils.system_utils import set_ulimit
-from vllm.version import __version__ as VLLM_VERSION
-
-prometheus_multiproc_dir: tempfile.TemporaryDirectory
-
-# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
-logger = init_logger("vllm.entrypoints.anthropic.api_server")
-
-_running_tasks: set[asyncio.Task] = set()
-
-router = APIRouter()
-
-
-def messages(request: Request) -> AnthropicServingMessages:
-    return request.app.state.anthropic_serving_messages
-
-
-def engine_client(request: Request) -> EngineClient:
-    return request.app.state.engine_client
-
-
-@router.get("/health", response_class=Response)
-async def health(raw_request: Request) -> Response:
-    """Health check."""
-    await engine_client(raw_request).check_health()
-    return Response(status_code=200)
-
-
-@router.get("/ping", response_class=Response)
-@router.post("/ping", response_class=Response)
-async def ping(raw_request: Request) -> Response:
-    """Ping check. Endpoint required for SageMaker"""
-    return await health(raw_request)
-
-
-@router.post(
-    "/v1/messages",
-    dependencies=[Depends(validate_json_request)],
-    responses={
-        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
-        HTTPStatus.BAD_REQUEST.value: {"model": AnthropicErrorResponse},
-        HTTPStatus.NOT_FOUND.value: {"model": AnthropicErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": AnthropicErrorResponse},
-    },
-)
-@with_cancellation
-@load_aware_call
-async def create_messages(request: AnthropicMessagesRequest, raw_request: Request):
-    handler = messages(raw_request)
-    if handler is None:
-        return messages(raw_request).create_error_response(
-            message="The model does not support Messages API"
-        )
-
-    generator = await handler.create_messages(request, raw_request)
-
-    if isinstance(generator, ErrorResponse):
-        return JSONResponse(content=generator.model_dump())
-
-    elif isinstance(generator, AnthropicMessagesResponse):
-        logger.debug(
-            "Anthropic Messages Response: %s", generator.model_dump(exclude_none=True)
-        )
-        return JSONResponse(content=generator.model_dump(exclude_none=True))
-
-    return StreamingResponse(content=generator, media_type="text/event-stream")
-
-
-async def init_app_state(
-    engine_client: EngineClient,
-    state: State,
-    args: Namespace,
-) -> None:
-    vllm_config = engine_client.vllm_config
-
-    if args.served_model_name is not None:
-        served_model_names = args.served_model_name
-    else:
-        served_model_names = [args.model]
-
-    if args.disable_log_requests:
-        request_logger = None
-    else:
-        request_logger = RequestLogger(max_log_len=args.max_log_len)
-
-    base_model_paths = [
-        BaseModelPath(name=name, model_path=args.model) for name in served_model_names
-    ]
-
-    state.engine_client = engine_client
-    state.log_stats = not args.disable_log_stats
-    state.vllm_config = vllm_config
-    model_config = vllm_config.model_config
-
-    default_mm_loras = (
-        vllm_config.lora_config.default_mm_loras
-        if vllm_config.lora_config is not None
-        else {}
-    )
-    lora_modules = process_lora_modules(args.lora_modules, default_mm_loras)
-
-    resolved_chat_template = await process_chat_template(
-        args.chat_template, engine_client, model_config
-    )
-
-    state.openai_serving_models = OpenAIServingModels(
-        engine_client=engine_client,
-        base_model_paths=base_model_paths,
-        lora_modules=lora_modules,
-    )
-    await state.openai_serving_models.init_static_loras()
-    state.anthropic_serving_messages = AnthropicServingMessages(
-        engine_client,
-        state.openai_serving_models,
-        args.response_role,
-        request_logger=request_logger,
-        chat_template=resolved_chat_template,
-        chat_template_content_format=args.chat_template_content_format,
-        return_tokens_as_token_ids=args.return_tokens_as_token_ids,
-        enable_auto_tools=args.enable_auto_tool_choice,
-        tool_parser=args.tool_call_parser,
-        reasoning_parser=args.reasoning_parser,
-        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
-        enable_force_include_usage=args.enable_force_include_usage,
-    )
-
-
-def setup_server(args):
-    """Validate API server args, set up signal handler, create socket
-    ready to serve."""
-
-    logger.info("vLLM API server version %s", VLLM_VERSION)
-
-    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
-        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
-
-    validate_api_server_args(args)
-
-    # workaround to make sure that we bind the port before the engine is set up.
-    # This avoids race conditions with ray.
-    # see https://github.com/vllm-project/vllm/issues/8204
-    sock_addr = (args.host or "", args.port)
-    sock = create_server_socket(sock_addr)
-
-    # workaround to avoid footguns where uvicorn drops requests with too
-    # many concurrent requests active
-    set_ulimit()
-
-    def signal_handler(*_) -> None:
-        # Interrupt server on sigterm while initializing
-        raise KeyboardInterrupt("terminated")
-
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    addr, port = sock_addr
-    is_ssl = args.ssl_keyfile and args.ssl_certfile
-    host_part = f"[{addr}]" if is_valid_ipv6_address(addr) else addr or "0.0.0.0"
-    listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}"
-
-    return listen_address, sock
-
-
-async def run_server(args, **uvicorn_kwargs) -> None:
-    """Run a single-worker API server."""
-    listen_address, sock = setup_server(args)
-    await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
-
-
-def build_app(args: Namespace) -> FastAPI:
-    app = FastAPI(lifespan=lifespan)
-    app.include_router(router)
-    app.root_path = args.root_path
-
-    app.add_middleware(
-        CORSMiddleware,
-        allow_origins=args.allowed_origins,
-        allow_credentials=args.allow_credentials,
-        allow_methods=args.allowed_methods,
-        allow_headers=args.allowed_headers,
-    )
-
-    return app
-
-
-async def run_server_worker(
-    listen_address, sock, args, client_config=None, **uvicorn_kwargs
-) -> None:
-    """Run a single API server worker."""
-
-    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
-        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
-
-    server_index = client_config.get("client_index", 0) if client_config else 0
-
-    # Load logging config for uvicorn if specified
-    log_config = load_log_config(args.log_config_file)
-    if log_config is not None:
-        uvicorn_kwargs["log_config"] = log_config
-
-    async with build_async_engine_client(
-        args,
-        client_config=client_config,
-    ) as engine_client:
-        app = build_app(args)
-
-        await init_app_state(engine_client, app.state, args)
-
-        logger.info("Starting vLLM API server %d on %s", server_index, listen_address)
-        shutdown_task = await serve_http(
-            app,
-            sock=sock,
-            enable_ssl_refresh=args.enable_ssl_refresh,
-            host=args.host,
-            port=args.port,
-            log_level=args.uvicorn_log_level,
-            # NOTE: When the 'disable_uvicorn_access_log' value is True,
-            # no access log will be output.
-            access_log=not args.disable_uvicorn_access_log,
-            timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
-            ssl_keyfile=args.ssl_keyfile,
-            ssl_certfile=args.ssl_certfile,
-            ssl_ca_certs=args.ssl_ca_certs,
-            ssl_cert_reqs=args.ssl_cert_reqs,
-            **uvicorn_kwargs,
-        )
-
-    # NB: Await server shutdown only after the backend context is exited
-    try:
-        await shutdown_task
-    finally:
-        sock.close()
-
-
-if __name__ == "__main__":
-    # NOTE(simon):
-    # This section should be in sync with vllm/entrypoints/cli/main.py for CLI
-    # entrypoints.
-    cli_env_setup()
-    parser = FlexibleArgumentParser(
-        description="vLLM Anthropic-Compatible RESTful API server."
-    )
-    parser = make_arg_parser(parser)
-    args = parser.parse_args()
-    validate_parsed_serve_args(args)
-
-    uvloop.run(run_server(args))
diff --git a/vllm/entrypoints/anthropic/serving_messages.py b/vllm/entrypoints/anthropic/serving_messages.py
index 11c96adf332f5..340dabf0e7117 100644
--- a/vllm/entrypoints/anthropic/serving_messages.py
+++ b/vllm/entrypoints/anthropic/serving_messages.py
@@ -231,9 +231,11 @@ class AnthropicServingMessages(OpenAIServingChat):
         See https://docs.anthropic.com/en/api/messages
         for the API specification. This API mimics the Anthropic messages API.
         """
-        logger.debug("Received messages request %s", request.model_dump_json())
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug("Received messages request %s", request.model_dump_json())
         chat_req = self._convert_anthropic_to_openai_request(request)
-        logger.debug("Convert to OpenAI request %s", request.model_dump_json())
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug("Convert to OpenAI request %s", chat_req.model_dump_json())
         generator = await self.create_chat_completion(chat_req, raw_request)
 
         if isinstance(generator, ErrorResponse):
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 09641aaff3066..d7d6419d643b0 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -43,11 +43,12 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, Processor
 # pydantic needs the TypedDict from typing_extensions
 from typing_extensions import Required, TypedDict
 
+from vllm import envs
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models import SupportsMultiModal
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
-from vllm.multimodal.utils import MediaConnector
+from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
 from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
@@ -806,7 +807,9 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         self._tracker = tracker
         multimodal_config = self._tracker.model_config.multimodal_config
         media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
-        self._connector = MediaConnector(
+
+        self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
+            envs.VLLM_MEDIA_CONNECTOR,
             media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
             allowed_media_domains=tracker.allowed_media_domains,
@@ -891,7 +894,8 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         self._tracker = tracker
         multimodal_config = self._tracker.model_config.multimodal_config
         media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
-        self._connector = MediaConnector(
+        self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
+            envs.VLLM_MEDIA_CONNECTOR,
             media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
             allowed_media_domains=tracker.allowed_media_domains,
diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index 99a8759c84f49..fb49be370203e 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -195,10 +195,15 @@ class CompleteCommand(CLISubcommand):
     def cmd(args: argparse.Namespace) -> None:
         model_name, client = _interactive_cli(args)
 
+        kwargs = {
+            "model": model_name,
+            "stream": True,
+        }
+        if args.max_tokens:
+            kwargs["max_tokens"] = args.max_tokens
+
         if args.quick:
-            stream = client.completions.create(
-                model=model_name, prompt=args.quick, stream=True
-            )
+            stream = client.completions.create(prompt=args.quick, **kwargs)
             _print_completion_stream(stream)
             return
 
@@ -208,15 +213,18 @@ class CompleteCommand(CLISubcommand):
                 input_prompt = input("> ")
             except EOFError:
                 break
-            stream = client.completions.create(
-                model=model_name, prompt=input_prompt, stream=True
-            )
+            stream = client.completions.create(prompt=input_prompt, **kwargs)
             _print_completion_stream(stream)
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         """Add CLI arguments for the complete command."""
         _add_query_options(parser)
+        parser.add_argument(
+            "--max-tokens",
+            type=int,
+            help="Maximum number of tokens to generate per output sequence.",
+        )
         parser.add_argument(
             "-q",
             "--quick",
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index dc6f3df5a68ec..2678658dd1262 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -88,9 +88,6 @@ def run_headless(args: argparse.Namespace):
         usage_context=usage_context, headless=True
     )
 
-    if not envs.VLLM_USE_V1:
-        raise ValueError("Headless mode is only supported for V1")
-
     if engine_args.data_parallel_hybrid_lb:
         raise ValueError("data_parallel_hybrid_lb is not applicable in headless mode")
 
@@ -156,15 +153,10 @@ def run_multi_api_server(args: argparse.Namespace):
     usage_context = UsageContext.OPENAI_API_SERVER
     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
 
-    if num_api_servers > 1:
-        if not envs.VLLM_USE_V1:
-            raise ValueError("api_server_count > 1 is only supported for V1")
-
-        if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
-            raise ValueError(
-                "VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used "
-                "with api_server_count > 1"
-            )
+    if num_api_servers > 1 and envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
+        raise ValueError(
+            "VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used with api_server_count > 1"
+        )
 
     executor_class = Executor.get_class(vllm_config)
     log_stats = not engine_args.disable_log_stats
diff --git a/vllm/entrypoints/dynamic_lora.py b/vllm/entrypoints/dynamic_lora.py
new file mode 100644
index 0000000000000..cc0f437e5c77f
--- /dev/null
+++ b/vllm/entrypoints/dynamic_lora.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import model_hosting_container_standards.sagemaker as sagemaker_standards
+from fastapi import APIRouter, Depends, Request
+from fastapi.responses import JSONResponse, Response
+
+from vllm.entrypoints.openai.api_server import models, validate_json_request
+from vllm.entrypoints.openai.protocol import (
+    ErrorResponse,
+    LoadLoRAAdapterRequest,
+    UnloadLoRAAdapterRequest,
+)
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def register_dynamic_lora_routes(router: APIRouter):
+    @sagemaker_standards.register_load_adapter_handler(
+        request_shape={
+            "lora_name": "body.name",
+            "lora_path": "body.src",
+        },
+    )
+    @router.post("/v1/load_lora_adapter", dependencies=[Depends(validate_json_request)])
+    async def load_lora_adapter(request: LoadLoRAAdapterRequest, raw_request: Request):
+        handler: OpenAIServingModels = models(raw_request)
+        response = await handler.load_lora_adapter(request)
+        if isinstance(response, ErrorResponse):
+            return JSONResponse(
+                content=response.model_dump(), status_code=response.error.code
+            )
+
+        return Response(status_code=200, content=response)
+
+    @sagemaker_standards.register_unload_adapter_handler(
+        request_shape={
+            "lora_name": "path_params.adapter_name",
+        }
+    )
+    @router.post(
+        "/v1/unload_lora_adapter", dependencies=[Depends(validate_json_request)]
+    )
+    async def unload_lora_adapter(
+        request: UnloadLoRAAdapterRequest, raw_request: Request
+    ):
+        handler: OpenAIServingModels = models(raw_request)
+        response = await handler.unload_lora_adapter(request)
+        if isinstance(response, ErrorResponse):
+            return JSONResponse(
+                content=response.model_dump(), status_code=response.error.code
+            )
+
+        return Response(status_code=200, content=response)
+
+    return router
diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py
index 7958d0317739a..47a252348c102 100644
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -521,15 +521,15 @@ def parse_chat_output(
     is_tool_call = False  # TODO: update this when tool call is supported
     if len(output_msgs) == 0:
         # The generation has stopped during reasoning.
-        reasoning_content = parser.current_content
+        reasoning = parser.current_content
         final_content = None
     elif len(output_msgs) == 1:
         # The generation has stopped during final message.
-        reasoning_content = output_msgs[0].content[0].text
+        reasoning = output_msgs[0].content[0].text
         final_content = parser.current_content
     else:
         reasoning_msg = output_msgs[:-1]
         final_msg = output_msgs[-1]
-        reasoning_content = "\n".join([msg.content[0].text for msg in reasoning_msg])
+        reasoning = "\n".join([msg.content[0].text for msg in reasoning_msg])
         final_content = final_msg.content[0].text
-    return reasoning_content, final_content, is_tool_call
+    return reasoning, final_content, is_tool_call
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 758e16c89e694..22fe2ae9280aa 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -67,6 +67,7 @@ from vllm.outputs import (
     RequestOutput,
     ScoringRequestOutput,
 )
+from vllm.platforms import current_platform
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams
 from vllm.tasks import PoolingTask
@@ -289,7 +290,11 @@ class LLM:
         # warn about single-process data parallel usage.
         _dp_size = int(kwargs.get("data_parallel_size", 1))
         _distributed_executor_backend = kwargs.get("distributed_executor_backend")
-        if _dp_size > 1 and not _distributed_executor_backend == "external_launcher":
+        if (
+            _dp_size > 1
+            and not _distributed_executor_backend == "external_launcher"
+            and not current_platform.is_tpu()
+        ):
             raise ValueError(
                 f"LLM(data_parallel_size={_dp_size}) is not supported for single-"
                 "process usage and may hang. Please use "
@@ -1583,20 +1588,27 @@ class LLM:
             tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
             it = tqdm_func(it, desc="Adding requests")
 
-        for i, prompt in enumerate(it):
-            if isinstance(prompt, dict):
-                self._validate_mm_data_and_uuids(
-                    prompt.get("multi_modal_data"), prompt.get("multi_modal_uuids")
-                )
+        added_request_ids: list[str] = []
 
-            self._add_request(
-                prompt,
-                params[i] if isinstance(params, Sequence) else params,
-                lora_request=lora_request[i]
-                if isinstance(lora_request, Sequence)
-                else lora_request,
-                priority=priority[i] if priority else 0,
-            )
+        try:
+            for i, prompt in enumerate(it):
+                if isinstance(prompt, dict):
+                    self._validate_mm_data_and_uuids(
+                        prompt.get("multi_modal_data"), prompt.get("multi_modal_uuids")
+                    )
+                request_id = self._add_request(
+                    prompt,
+                    params[i] if isinstance(params, Sequence) else params,
+                    lora_request=lora_request[i]
+                    if isinstance(lora_request, Sequence)
+                    else lora_request,
+                    priority=priority[i] if priority else 0,
+                )
+                added_request_ids.append(request_id)
+        except Exception as e:
+            if added_request_ids:
+                self.llm_engine.abort_request(added_request_ids)
+            raise e
 
     def _validate_mm_data_and_uuids(
         self,
@@ -1679,7 +1691,7 @@ class LLM:
         params: SamplingParams | PoolingParams,
         lora_request: LoRARequest | None = None,
         priority: int = 0,
-    ) -> None:
+    ) -> str:
         prompt_text, _, _ = get_prompt_components(prompt)
         request_id = str(next(self.request_counter))
 
@@ -1700,6 +1712,7 @@ class LLM:
             priority=priority,
             prompt_text=prompt_text,
         )
+        return request_id
 
     def _run_engine(
         self, *, use_tqdm: bool | Callable[..., tqdm] = True
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 8fa71855f8f66..fbb2d32a229da 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import asyncio
-import gc
 import hashlib
 import importlib
 import inspect
@@ -21,6 +19,7 @@ from contextlib import asynccontextmanager
 from http import HTTPStatus
 from typing import Annotated, Any, Literal
 
+import model_hosting_container_standards.sagemaker as sagemaker_standards
 import prometheus_client
 import pydantic
 import regex as re
@@ -41,9 +40,17 @@ import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import Device, EngineClient
+from vllm.entrypoints.anthropic.protocol import (
+    AnthropicError,
+    AnthropicErrorResponse,
+    AnthropicMessagesRequest,
+    AnthropicMessagesResponse,
+)
+from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
+from vllm.entrypoints.openai.orca_metrics import metrics_header
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -59,7 +66,6 @@ from vllm.entrypoints.openai.protocol import (
     ErrorInfo,
     ErrorResponse,
     IOProcessorResponse,
-    LoadLoRAAdapterRequest,
     PoolingBytesResponse,
     PoolingRequest,
     PoolingResponse,
@@ -76,7 +82,6 @@ from vllm.entrypoints.openai.protocol import (
     TranscriptionResponse,
     TranslationRequest,
     TranslationResponse,
-    UnloadLoRAAdapterRequest,
 )
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_classification import ServingClassification
@@ -110,6 +115,7 @@ from vllm.reasoning import ReasoningParserManager
 from vllm.tasks import POOLING_TASKS
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.gc_utils import freeze_gc_heap
 from vllm.utils.network_utils import is_valid_ipv6_address
 from vllm.utils.system_utils import decorate_logs, set_ulimit
 from vllm.v1.engine.exceptions import EngineDeadError
@@ -121,6 +127,8 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory
 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
 logger = init_logger("vllm.entrypoints.openai.api_server")
 
+ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL = "endpoint-load-metrics-format"
+
 _running_tasks: set[asyncio.Task] = set()
 
 
@@ -143,8 +151,7 @@ async def lifespan(app: FastAPI):
 
         # Mark the startup heap as static so that it's ignored by GC.
         # Reduces pause times of oldest generation collections.
-        gc.collect()
-        gc.freeze()
+        freeze_gc_heap()
         try:
             yield
         finally:
@@ -210,14 +217,8 @@ async def build_async_engine_client_from_engine_args(
     # Create the EngineConfig (determines if we can use V1).
     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
 
-    # V1 AsyncLLM.
-    assert envs.VLLM_USE_V1
-
     if disable_frontend_multiprocessing:
-        logger.warning(
-            "V1 is enabled, but got --disable-frontend-multiprocessing. "
-            "To disable frontend multiprocessing, set VLLM_USE_V1=0."
-        )
+        logger.warning("V1 is enabled, but got --disable-frontend-multiprocessing.")
 
     from vllm.v1.engine.async_llm import AsyncLLM
 
@@ -308,6 +309,10 @@ def responses(request: Request) -> OpenAIServingResponses | None:
     return request.app.state.openai_serving_responses
 
 
+def messages(request: Request) -> AnthropicServingMessages:
+    return request.app.state.anthropic_serving_messages
+
+
 def chat(request: Request) -> OpenAIServingChat | None:
     return request.app.state.openai_serving_chat
 
@@ -381,13 +386,6 @@ async def get_server_load_metrics(request: Request):
     return JSONResponse(content={"server_load": request.app.state.server_load_metrics})
 
 
-@router.get("/ping", response_class=Response)
-@router.post("/ping", response_class=Response)
-async def ping(raw_request: Request) -> Response:
-    """Ping check. Endpoint required for SageMaker"""
-    return await health(raw_request)
-
-
 @router.post(
     "/tokenize",
     dependencies=[Depends(validate_json_request)],
@@ -591,6 +589,62 @@ async def cancel_responses(response_id: str, raw_request: Request):
     return JSONResponse(content=response.model_dump())
 
 
+@router.post(
+    "/v1/messages",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": AnthropicErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": AnthropicErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": AnthropicErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def create_messages(request: AnthropicMessagesRequest, raw_request: Request):
+    def translate_error_response(response: ErrorResponse) -> JSONResponse:
+        anthropic_error = AnthropicErrorResponse(
+            error=AnthropicError(
+                type=response.error.type,
+                message=response.error.message,
+            )
+        )
+        return JSONResponse(
+            status_code=response.error.code, content=anthropic_error.model_dump()
+        )
+
+    handler = messages(raw_request)
+    if handler is None:
+        error = base(raw_request).create_error_response(
+            message="The model does not support Messages API"
+        )
+        return translate_error_response(error)
+
+    try:
+        generator = await handler.create_messages(request, raw_request)
+    except Exception as e:
+        logger.exception("Error in create_messages: %s", e)
+        return JSONResponse(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
+            content=AnthropicErrorResponse(
+                error=AnthropicError(
+                    type="internal_error",
+                    message=str(e),
+                )
+            ).model_dump(),
+        )
+
+    if isinstance(generator, ErrorResponse):
+        return translate_error_response(generator)
+
+    elif isinstance(generator, AnthropicMessagesResponse):
+        resp = generator.model_dump(exclude_none=True)
+        logger.debug("Anthropic Messages Response: %s", resp)
+        return JSONResponse(content=resp)
+
+    return StreamingResponse(content=generator, media_type="text/event-stream")
+
+
 @router.post(
     "/v1/chat/completions",
     dependencies=[Depends(validate_json_request)],
@@ -604,6 +658,9 @@ async def cancel_responses(response_id: str, raw_request: Request):
 @with_cancellation
 @load_aware_call
 async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
+    metrics_header_format = raw_request.headers.get(
+        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
+    )
     handler = chat(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
@@ -621,7 +678,10 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
         )
 
     elif isinstance(generator, ChatCompletionResponse):
-        return JSONResponse(content=generator.model_dump())
+        return JSONResponse(
+            content=generator.model_dump(),
+            headers=metrics_header(metrics_header_format),
+        )
 
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
@@ -639,6 +699,9 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
 @with_cancellation
 @load_aware_call
 async def create_completion(request: CompletionRequest, raw_request: Request):
+    metrics_header_format = raw_request.headers.get(
+        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
+    )
     handler = completion(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
@@ -661,7 +724,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
             content=generator.model_dump(), status_code=generator.error.code
         )
     elif isinstance(generator, CompletionResponse):
-        return JSONResponse(content=generator.model_dump())
+        return JSONResponse(
+            content=generator.model_dump(),
+            headers=metrics_header(metrics_header_format),
+        )
 
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
@@ -1162,52 +1228,17 @@ INVOCATION_VALIDATORS = [
 ]
 
 
-@router.post(
-    "/invocations",
-    dependencies=[Depends(validate_json_request)],
-    responses={
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.UNSUPPORTED_MEDIA_TYPE.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-    },
-)
-async def invocations(raw_request: Request):
-    """For SageMaker, routes requests based on the request type."""
-    try:
-        body = await raw_request.json()
-    except json.JSONDecodeError as e:
-        raise HTTPException(
-            status_code=HTTPStatus.BAD_REQUEST.value, detail=f"JSON decode error: {e}"
-        ) from e
-
-    valid_endpoints = [
-        (validator, endpoint)
-        for validator, (get_handler, endpoint) in INVOCATION_VALIDATORS
-        if get_handler(raw_request) is not None
-    ]
-
-    for request_validator, endpoint in valid_endpoints:
-        try:
-            request = request_validator.validate_python(body)
-        except pydantic.ValidationError:
-            continue
-
-        return await endpoint(request, raw_request)
-
-    type_names = [
-        t.__name__ if isinstance(t := validator._type, type) else str(t)
-        for validator, _ in valid_endpoints
-    ]
-    msg = f"Cannot find suitable handler for request. Expected one of: {type_names}"
-    res = base(raw_request).create_error_response(message=msg)
-    return JSONResponse(content=res.model_dump(), status_code=res.error.code)
-
-
 if envs.VLLM_TORCH_PROFILER_DIR:
-    logger.warning(
+    logger.warning_once(
         "Torch Profiler is enabled in the API server. This should ONLY be "
         "used for local development!"
     )
+elif envs.VLLM_TORCH_CUDA_PROFILE:
+    logger.warning_once(
+        "CUDA Profiler is enabled in the API server. This should ONLY be "
+        "used for local development!"
+    )
+if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE:
 
     @router.post("/start_profile")
     async def start_profile(raw_request: Request):
@@ -1224,39 +1255,6 @@ if envs.VLLM_TORCH_PROFILER_DIR:
         return Response(status_code=200)
 
 
-if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
-    logger.warning(
-        "LoRA dynamic loading & unloading is enabled in the API server. "
-        "This should ONLY be used for local development!"
-    )
-
-    @router.post("/v1/load_lora_adapter", dependencies=[Depends(validate_json_request)])
-    async def load_lora_adapter(request: LoadLoRAAdapterRequest, raw_request: Request):
-        handler = models(raw_request)
-        response = await handler.load_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(
-                content=response.model_dump(), status_code=response.error.code
-            )
-
-        return Response(status_code=200, content=response)
-
-    @router.post(
-        "/v1/unload_lora_adapter", dependencies=[Depends(validate_json_request)]
-    )
-    async def unload_lora_adapter(
-        request: UnloadLoRAAdapterRequest, raw_request: Request
-    ):
-        handler = models(raw_request)
-        response = await handler.unload_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(
-                content=response.model_dump(), status_code=response.error.code
-            )
-
-        return Response(status_code=200, content=response)
-
-
 def load_log_config(log_config_file: str | None) -> dict | None:
     if not log_config_file:
         return None
@@ -1495,8 +1493,7 @@ def _log_streaming_response(response, response_body: list) -> None:
                             full_content = full_content[:2048] + ""
                             "...[truncated]"
                         logger.info(
-                            "response_body={streaming_complete: "
-                            "content='%s', chunks=%d}",
+                            "response_body={streaming_complete: content=%r, chunks=%d}",
                             full_content,
                             chunk_count,
                         )
@@ -1527,6 +1524,20 @@ def build_app(args: Namespace) -> FastAPI:
         )
     else:
         app = FastAPI(lifespan=lifespan)
+
+    if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
+        logger.warning(
+            "LoRA dynamic loading & unloading is enabled in the API server. "
+            "This should ONLY be used for local development!"
+        )
+        from vllm.entrypoints.dynamic_lora import register_dynamic_lora_routes
+
+        register_dynamic_lora_routes(router)
+
+    from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes
+
+    register_sagemaker_routes(router)
+
     app.include_router(router)
     app.root_path = args.root_path
 
@@ -1617,6 +1628,8 @@ def build_app(args: Namespace) -> FastAPI:
                 f"Invalid middleware {middleware}. Must be a function or a class."
             )
 
+    app = sagemaker_standards.bootstrap(app)
+
     return app
 
 
@@ -1817,6 +1830,24 @@ async def init_app_state(
         if "transcription" in supported_tasks
         else None
     )
+    state.anthropic_serving_messages = (
+        AnthropicServingMessages(
+            engine_client,
+            state.openai_serving_models,
+            args.response_role,
+            request_logger=request_logger,
+            chat_template=resolved_chat_template,
+            chat_template_content_format=args.chat_template_content_format,
+            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+            enable_auto_tools=args.enable_auto_tool_choice,
+            tool_parser=args.tool_call_parser,
+            reasoning_parser=args.structured_outputs_config.reasoning_parser,
+            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+            enable_force_include_usage=args.enable_force_include_usage,
+        )
+        if "generate" in supported_tasks
+        else None
+    )
 
     state.enable_server_load_tracking = args.enable_server_load_tracking
     state.server_load_metrics = 0
@@ -1842,20 +1873,20 @@ def create_server_unix_socket(path: str) -> socket.socket:
 
 
 def validate_api_server_args(args):
-    valid_tool_parses = ToolParserManager.tool_parsers.keys()
+    valid_tool_parses = ToolParserManager.list_registered()
     if args.enable_auto_tool_choice and args.tool_call_parser not in valid_tool_parses:
         raise KeyError(
             f"invalid tool call parser: {args.tool_call_parser} "
             f"(chose from {{ {','.join(valid_tool_parses)} }})"
         )
 
-    valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
+    valid_reasoning_parsers = ReasoningParserManager.list_registered()
     if (
         reasoning_parser := args.structured_outputs_config.reasoning_parser
-    ) and reasoning_parser not in valid_reasoning_parses:
+    ) and reasoning_parser not in valid_reasoning_parsers:
         raise KeyError(
             f"invalid reasoning parser: {reasoning_parser} "
-            f"(chose from {{ {','.join(valid_reasoning_parses)} }})"
+            f"(chose from {{ {','.join(valid_reasoning_parsers)} }})"
         )
 
 
@@ -1869,6 +1900,9 @@ def setup_server(args):
     if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
         ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 
+    if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3:
+        ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin)
+
     validate_api_server_args(args)
 
     # workaround to make sure that we bind the port before the engine is set up.
@@ -1918,6 +1952,9 @@ async def run_server_worker(
     if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
         ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 
+    if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3:
+        ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin)
+
     # Load logging config for uvicorn if specified
     log_config = load_log_config(args.log_config_file)
     if log_config is not None:
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 1a775d3d68094..476587c178237 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -219,7 +219,7 @@ class FrontendArgs:
         frontend_kwargs["middleware"]["default"] = []
 
         # Special case: Tool call parser shows built-in options.
-        valid_tool_parsers = list(ToolParserManager.tool_parsers.keys())
+        valid_tool_parsers = list(ToolParserManager.list_registered())
         parsers_str = ",".join(valid_tool_parsers)
         frontend_kwargs["tool_call_parser"]["metavar"] = (
             f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
diff --git a/vllm/entrypoints/openai/orca_metrics.py b/vllm/entrypoints/openai/orca_metrics.py
new file mode 100644
index 0000000000000..3808262bf31f2
--- /dev/null
+++ b/vllm/entrypoints/openai/orca_metrics.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Utility functions that create ORCA endpoint load report response headers.
+"""
+
+import json
+from collections.abc import Mapping
+
+from vllm.logger import init_logger
+from vllm.v1.metrics.reader import Gauge, get_metrics_snapshot
+
+logger = init_logger(__name__)
+
+
+def create_orca_header(
+    metrics_format: str, named_metrics: list[tuple[str, float]]
+) -> Mapping[str, str] | None:
+    """
+    Creates ORCA headers named 'endpoint-load-metrics' in the specified format
+    and adds custom metrics to named_metrics.
+    ORCA headers format description: https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0
+    ORCA proto https://github.com/cncf/xds/blob/main/xds/data/orca/v3/orca_load_report.proto
+
+    Parameters:
+    - metrics_format (str): The format of the header ('TEXT', 'JSON').
+    - named_metrics (List[Tuple[str, float]]): List of tuples with metric names
+    and their corresponding double values.
+
+    Returns:
+    - Optional[Mapping[str,str]]: A dictionary with header key as
+    'endpoint-load-metrics' and values as the ORCA header strings with
+    format prefix and data in  with named_metrics in.
+    """
+
+    if metrics_format.lower() not in ["text", "json"]:
+        logger.warning(
+            "Warning: `%s` format is not supported in the ORCA response header",
+            format,
+        )
+        return None
+
+    header = {}
+    orca_report = {
+        "named_metrics": {
+            metric_name: value
+            for metric_name, value in named_metrics
+            if isinstance(metric_name, str) and isinstance(value, float)
+        }
+    }
+    # output example:
+    # endpoint-load-metrics: TEXT named_metrics.kv_cache_utilization=0.4
+    if metrics_format.lower() == "text":
+        native_http_header = ", ".join(
+            [
+                f"named_metrics.{metric_name}={value}"
+                for metric_name, value in named_metrics
+                if isinstance(metric_name, str) and isinstance(value, float)
+            ]
+        )
+        header["endpoint-load-metrics"] = f"TEXT {native_http_header}"
+
+    # output example:
+    # endpoint-load-metrics: JSON “named_metrics”: {“custom-metric-util”: 0.4}
+    elif metrics_format.lower() == "json":
+        header["endpoint-load-metrics"] = f"JSON {json.dumps(orca_report)}"
+
+    logger.info("Created ORCA header %s", header)
+
+    return header
+
+
+def get_named_metrics_from_prometheus() -> list[tuple[str, float]]:
+    """
+    Collects current metrics from Prometheus and returns some of them
+    in the form of the `named_metrics` list for `create_orca_header()`.
+
+    Parameters:
+    - None
+
+    Returns:
+    - list[tuple[str, float]]: List of tuples of metric names and their values.
+    """
+    named_metrics: list[tuple[str, float]] = []
+    # Map from prometheus metric names to ORCA named metrics.
+    prometheus_to_orca_metrics = {
+        "vllm:kv_cache_usage_perc": "kv_cache_usage_perc",
+        "vllm:num_requests_waiting": "num_requests_waiting",
+    }
+    metrics = get_metrics_snapshot()
+    for metric in metrics:
+        orca_name = prometheus_to_orca_metrics.get(metric.name)
+        # If this metric is mapped into ORCA, then add it to the report.
+        # Note: Only Gauge metrics are currently supported.
+        if orca_name is not None and isinstance(metric, Gauge):
+            named_metrics.append((str(orca_name), float(metric.value)))
+    return named_metrics
+
+
+def metrics_header(metrics_format: str) -> Mapping[str, str] | None:
+    """
+    Creates ORCA headers named 'endpoint-load-metrics' in the specified format.
+    Metrics are collected from Prometheus using `get_named_metrics_from_prometheus()`.
+
+    ORCA headers format description: https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0
+    ORCA proto https://github.com/cncf/xds/blob/main/xds/data/orca/v3/orca_load_report.proto
+
+    Parameters:
+    - metrics_format (str): The format of the header ('TEXT', 'JSON').
+
+    Returns:
+    - Optional[Mapping[str,str]]: A dictionary with header key as
+    'endpoint-load-metrics' and values as the ORCA header strings with
+    format prefix and data in  with named_metrics in.
+    """
+    if not metrics_format:
+        return None
+    # Get named metrics from prometheus.
+    named_metrics = get_named_metrics_from_prometheus()
+    return create_orca_header(metrics_format, named_metrics)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index d0061f9d5b40f..69e757d4764d2 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -79,7 +79,6 @@ from pydantic import (
     model_validator,
 )
 
-from vllm import envs
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam, make_tool_call_id
 from vllm.entrypoints.score_utils import ScoreContentPartParam, ScoreMultiModalParam
 from vllm.logger import init_logger
@@ -475,16 +474,12 @@ class ResponsesRequest(OpenAIBaseModel):
 
     @model_validator(mode="before")
     def check_cache_salt_support(cls, data):
-        if data.get("cache_salt") is not None:
-            if not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "Parameter 'cache_salt' is not supported with "
-                    "this instance of vLLM, which uses engine V0."
-                )
-            if not isinstance(data["cache_salt"], str) or not data["cache_salt"]:
-                raise ValueError(
-                    "Parameter 'cache_salt' must be a non-empty string if provided."
-                )
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
         return data
 
     @model_validator(mode="before")
@@ -777,10 +772,10 @@ class ChatCompletionRequest(OpenAIBaseModel):
         description="KVTransfer parameters used for disaggregated serving.",
     )
 
-    vllm_xargs: dict[str, str | int | float] | None = Field(
+    vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
         default=None,
         description=(
-            "Additional request parameters with string or "
+            "Additional request parameters with (list of) string or "
             "numeric values, used by custom extensions."
         ),
     )
@@ -946,10 +941,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
 
             if prompt_logprobs < 0 and prompt_logprobs != -1:
                 raise ValueError("`prompt_logprobs` must be a positive value or -1.")
-            if prompt_logprobs == -1 and not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "`prompt_logprobs=-1` is only supported with vLLM engine V1."
-                )
         if (top_logprobs := data.get("top_logprobs")) is not None:
             if top_logprobs < 0 and top_logprobs != -1:
                 raise ValueError("`top_logprobs` must be a positive value or -1.")
@@ -1083,16 +1074,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
     @model_validator(mode="before")
     @classmethod
     def check_cache_salt_support(cls, data):
-        if data.get("cache_salt") is not None:
-            if not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "Parameter 'cache_salt' is not supported with "
-                    "this instance of vLLM, which uses engine V0."
-                )
-            if not isinstance(data["cache_salt"], str) or not data["cache_salt"]:
-                raise ValueError(
-                    "Parameter 'cache_salt' must be a non-empty string if provided."
-                )
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
         return data
 
 
@@ -1449,10 +1436,6 @@ class CompletionRequest(OpenAIBaseModel):
 
             if prompt_logprobs < 0 and prompt_logprobs != -1:
                 raise ValueError("`prompt_logprobs` must be a positive value or -1.")
-            if prompt_logprobs == -1 and not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "`prompt_logprobs=-1` is only supported with vLLM engine V1."
-                )
         if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
             raise ValueError("`logprobs` must be a positive value.")
 
@@ -1487,16 +1470,12 @@ class CompletionRequest(OpenAIBaseModel):
     @model_validator(mode="before")
     @classmethod
     def check_cache_salt_support(cls, data):
-        if data.get("cache_salt") is not None:
-            if not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "Parameter 'cache_salt' is not supported with "
-                    "this instance of vLLM, which uses engine V0."
-                )
-            if not isinstance(data["cache_salt"], str) or not data["cache_salt"]:
-                raise ValueError(
-                    "Parameter 'cache_salt' must be a non-empty string if provided."
-                )
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
         return data
 
 
@@ -2123,7 +2102,15 @@ class ChatMessage(OpenAIBaseModel):
     tool_calls: list[ToolCall] = Field(default_factory=list)
 
     # vLLM-specific fields that are not in OpenAI spec
+    reasoning: str | None = None
     reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
+
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
 
 
 class ChatCompletionLogProb(OpenAIBaseModel):
@@ -2177,9 +2164,17 @@ class ChatCompletionResponse(OpenAIBaseModel):
 class DeltaMessage(OpenAIBaseModel):
     role: str | None = None
     content: str | None = None
+    reasoning: str | None = None
     reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
     tool_calls: list[DeltaToolCall] = Field(default_factory=list)
 
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
+
 
 class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 4caccf88fd7d7..4b9dba085a8e9 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -334,13 +334,13 @@ async def run_request(
 
 
 def validate_run_batch_args(args):
-    valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
+    valid_reasoning_parsers = ReasoningParserManager.list_registered()
     if (
         reasoning_parser := args.structured_outputs_config.reasoning_parser
-    ) and reasoning_parser not in valid_reasoning_parses:
+    ) and reasoning_parser not in valid_reasoning_parsers:
         raise KeyError(
             f"invalid reasoning parser: {reasoning_parser} "
-            f"(chose from {{ {','.join(valid_reasoning_parses)} }})"
+            f"(chose from {{ {','.join(valid_reasoning_parsers)} }})"
         )
 
 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index bb770ecf03383..59e1c8d531793 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -13,7 +13,6 @@ import partial_json_parser
 import regex as re
 from fastapi import Request
 from openai_harmony import Message as OpenAIMessage
-from pydantic import TypeAdapter
 
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (
@@ -47,8 +46,6 @@ from vllm.entrypoints.openai.protocol import (
     DeltaMessage,
     DeltaToolCall,
     ErrorResponse,
-    FunctionCall,
-    FunctionDefinition,
     PromptTokenUsageInfo,
     RequestResponseMetadata,
     ToolCall,
@@ -71,6 +68,7 @@ from vllm.transformers_utils.tokenizers import (
     validate_request_params,
 )
 from vllm.utils.collection_utils import as_list
+from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
 
 logger = init_logger(__name__)
 
@@ -110,6 +108,9 @@ class OpenAIServingChat(OpenAIServing):
         self.trust_request_chat_template = trust_request_chat_template
         self.enable_log_outputs = enable_log_outputs
 
+        # set up logits processors
+        self.logits_processors = self.model_config.logits_processors
+
         # set up reasoning parser
         self.reasoning_parser = self._get_reasoning_parser(
             reasoning_parser_name=reasoning_parser
@@ -294,6 +295,10 @@ class OpenAIServingChat(OpenAIServing):
                         self.model_config.logits_processor_pattern,
                         self.default_sampling_params,
                     )
+                    validate_logits_processors_parameters(
+                        self.logits_processors,
+                        sampling_params,
+                    )
 
                 self._log_inputs(
                     request_id,
@@ -754,9 +759,7 @@ class OpenAIServingChat(OpenAIServing):
                             delta_message = DeltaMessage(content=delta_text)
                         elif cur_channel == "analysis":
                             if request.include_reasoning:
-                                delta_message = DeltaMessage(
-                                    reasoning_content=delta_text
-                                )
+                                delta_message = DeltaMessage(reasoning=delta_text)
                             else:
                                 delta_message = None
                         elif (
@@ -818,7 +821,7 @@ class OpenAIServingChat(OpenAIServing):
                         ):
                             assert reasoning_parser is not None
                             delta_message = (
-                                reasoning_parser.extract_reasoning_content_streaming(
+                                reasoning_parser.extract_reasoning_streaming(
                                     previous_text,
                                     current_text,
                                     delta_text,
@@ -831,7 +834,7 @@ class OpenAIServingChat(OpenAIServing):
                             # or think end id in prompt_token_ids
                             # i.e {"enable_thinking": False},
                             # set reasoning status to end.
-                            # Only keep 'content', remove 'reasoning_content'.
+                            # Only keep 'content', remove 'reasoning'.
                             if reasoning_parser.is_reasoning_end(
                                 as_list(output.token_ids)
                             ) or (
@@ -894,7 +897,7 @@ class OpenAIServingChat(OpenAIServing):
 
                         if self.reasoning_parser and not reasoning_end_arr[i]:
                             delta_message = (
-                                reasoning_parser.extract_reasoning_content_streaming(
+                                reasoning_parser.extract_reasoning_streaming(
                                     previous_text,
                                     current_text,
                                     delta_text,
@@ -943,7 +946,7 @@ class OpenAIServingChat(OpenAIServing):
                         output_token_ids = as_list(output.token_ids)
                         if not reasoning_end_arr[i]:
                             delta_message = (
-                                reasoning_parser.extract_reasoning_content_streaming(
+                                reasoning_parser.extract_reasoning_streaming(
                                     previous_text,
                                     current_text,
                                     delta_text,
@@ -956,7 +959,7 @@ class OpenAIServingChat(OpenAIServing):
                             # i.e {"enable_thinking": False},
                             # set reasoning status to end.
                             # Remove the text and token ids related
-                            # to 'reasoning_content'.
+                            # to 'reasoning'.
                             if (
                                 res.prompt_token_ids
                                 and reasoning_parser.is_reasoning_end(
@@ -973,7 +976,7 @@ class OpenAIServingChat(OpenAIServing):
                             # When encountering think end id in delta_token_ids,
                             # set reasoning status to end.
                             # Remove the text and token ids related
-                            # to 'reasoning_content'.
+                            # to 'reasoning'.
                             if reasoning_parser.is_reasoning_end(output_token_ids):
                                 reasoning_end_arr[i] = True
                                 current_token_ids = (
@@ -1028,15 +1031,13 @@ class OpenAIServingChat(OpenAIServing):
 
                     # when only reasoning
                     elif self.reasoning_parser:
-                        delta_message = (
-                            reasoning_parser.extract_reasoning_content_streaming(
-                                previous_text,
-                                current_text,
-                                delta_text,
-                                previous_token_ids,
-                                current_token_ids,
-                                output.token_ids,
-                            )
+                        delta_message = reasoning_parser.extract_reasoning_streaming(
+                            previous_text,
+                            current_text,
+                            delta_text,
+                            previous_token_ids,
+                            current_token_ids,
+                            output.token_ids,
                         )
                     # handle streaming just a content delta
                     else:
@@ -1170,9 +1171,13 @@ class OpenAIServingChat(OpenAIServing):
                             )
 
                         # Send the finish response for each request.n only once
+                        # In OpenAI's API, when a tool is called, the
+                        # finish_reason is:
+                        # "tool_calls" for "auto" or "required" tool calls,
+                        # and "stop" for named tool calls.
                         if (
                             auto_tools_called
-                            or tools_streamed[i]
+                            or (tools_streamed[i] and not tool_choice_function_name)
                             or (self.use_harmony and harmony_tools_streamed[i])
                         ):
                             finish_reason_ = "tool_calls"
@@ -1325,9 +1330,9 @@ class OpenAIServingChat(OpenAIServing):
                 logprobs = None
 
             if self.use_harmony:
-                reasoning_content, content, _ = parse_chat_output(token_ids)
+                reasoning, content, _ = parse_chat_output(token_ids)
                 if not request.include_reasoning:
-                    reasoning_content = None
+                    reasoning = None
 
                 if self.tool_parser is not None:
                     tool_parser = self.tool_parser(tokenizer)
@@ -1340,14 +1345,14 @@ class OpenAIServingChat(OpenAIServing):
                     content = tool_call_info.content
                     message = ChatMessage(
                         role=role,
-                        reasoning_content=reasoning_content,
+                        reasoning=reasoning,
                         content=content,
                         tool_calls=tool_call_info.tool_calls,
                     )
                 else:
                     message = ChatMessage(
                         role=role,
-                        reasoning_content=reasoning_content,
+                        reasoning=reasoning,
                         content=content,
                     )
 
@@ -1363,6 +1368,9 @@ class OpenAIServingChat(OpenAIServing):
                         else "stop"
                     ),
                     stop_reason=output.stop_reason,
+                    token_ids=(
+                        as_list(output.token_ids) if request.return_token_ids else None
+                    ),
                 )
                 choices.append(choice_data)
                 continue
@@ -1378,97 +1386,73 @@ class OpenAIServingChat(OpenAIServing):
                     return self.create_error_response(str(e))
                 # If the reasoning parser is enabled,
                 # tool calls are extracted exclusively from the content.
-                reasoning_content, content = reasoning_parser.extract_reasoning_content(
+                reasoning, content = reasoning_parser.extract_reasoning(
                     output.text, request=request
                 )
                 if not request.include_reasoning:
-                    reasoning_content = None
+                    reasoning = None
             else:
-                reasoning_content = None
+                reasoning = None
                 content = output.text
 
             auto_tools_called = False
             # if auto tools are not enabled, and a named tool choice using
             #   outlines is not being used
+            tool_calls, content = self._parse_tool_calls_from_content(
+                request=request,
+                tokenizer=tokenizer,
+                content=content,
+                enable_auto_tools=self.enable_auto_tools,
+                tool_parser_cls=self.tool_parser,
+            )
+            tool_call_class = (
+                MistralToolCall if isinstance(tokenizer, MistralTokenizer) else ToolCall
+            )
             if (not self.enable_auto_tools or not self.tool_parser) and (
                 not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
                 and request.tool_choice != "required"
             ):
-                message = ChatMessage(
-                    role=role, reasoning_content=reasoning_content, content=content
-                )
+                message = ChatMessage(role=role, reasoning=reasoning, content=content)
 
             # if the request uses tools and specified a tool choice
             elif (
                 request.tool_choice
                 and type(request.tool_choice) is ChatCompletionNamedToolChoiceParam
             ):
-                tool_call_class = (
-                    MistralToolCall
-                    if isinstance(tokenizer, MistralTokenizer)
-                    else ToolCall
-                )
+                assert tool_calls is not None and len(tool_calls) > 0
                 message = ChatMessage(
                     role=role,
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                     content="",
-                    tool_calls=[
-                        tool_call_class(
-                            function=FunctionCall(
-                                name=request.tool_choice.function.name,
-                                arguments=content,
-                            )
-                        )
-                    ],
+                    tool_calls=[tool_call_class(function=tc) for tc in tool_calls],
                 )
 
             elif request.tool_choice and request.tool_choice == "required":
-                tool_call_class = (
-                    MistralToolCall
-                    if isinstance(tokenizer, MistralTokenizer)
-                    else ToolCall
-                )
-
-                # the fields of FunctionDefinition are a superset of the
-                # tool call outputs and can be used for parsing
-                assert content is not None
-                tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(
-                    content
-                )
-                tool_call_ids = []
+                tool_call_class_items = []
+                assert tool_calls is not None and len(tool_calls) > 0
                 for tool_call in tool_calls:
-                    tool_call_ids.append(
-                        make_tool_call_id(
-                            id_type=self.tool_call_id_type,
-                            func_name=tool_call.name,
-                            idx=history_tool_call_cnt,
+                    tool_call_class_items.append(
+                        tool_call_class(
+                            id=make_tool_call_id(
+                                id_type=self.tool_call_id_type,
+                                func_name=tool_call.name,
+                                idx=history_tool_call_cnt,
+                            ),
+                            function=tool_call,
                         )
                     )
                     history_tool_call_cnt += 1
                 message = ChatMessage(
                     role=role,
                     content="",
-                    tool_calls=[
-                        tool_call_class(
-                            id=tool_call_ids[i],
-                            function=FunctionCall(
-                                name=tool_call.name,
-                                arguments=json.dumps(
-                                    tool_call.parameters, ensure_ascii=False
-                                ),
-                            ),
-                        )
-                        for i, tool_call in enumerate(tool_calls)
-                    ],
-                    reasoning_content=reasoning_content,
+                    tool_calls=tool_call_class_items,
+                    reasoning=reasoning,
                 )
 
             # if the request doesn't use tool choice
             # OR specifies to not use a tool
             elif not request.tool_choice or request.tool_choice == "none":
-                message = ChatMessage(
-                    role=role, reasoning_content=reasoning_content, content=content
-                )
+                message = ChatMessage(role=role, reasoning=reasoning, content=content)
 
             # handle when there are tools and tool choice is auto
             elif (
@@ -1477,25 +1461,22 @@ class OpenAIServingChat(OpenAIServing):
                 and self.enable_auto_tools
                 and self.tool_parser
             ):
-                try:
-                    tool_parser = self.tool_parser(tokenizer)
-                except RuntimeError as e:
-                    logger.exception("Error in tool parser creation.")
-                    return self.create_error_response(str(e))
-
-                tool_call_info = tool_parser.extract_tool_calls(
-                    content if content is not None else "", request=request
-                )
                 # In the OpenAI API the finish_reason is "tools_called"
                 # if the tool choice is auto and the model produced a tool
                 # call. The same is not true for named function calls
-                auto_tools_called = tool_call_info.tools_called
-                if tool_call_info.tools_called:
+                auto_tools_called = tool_calls is not None and len(tool_calls) > 0
+                if tool_calls:
                     message = ChatMessage(
                         role=role,
-                        reasoning_content=reasoning_content,
-                        content=tool_call_info.content,
-                        tool_calls=tool_call_info.tool_calls,
+                        reasoning=reasoning,
+                        content=content,
+                        tool_calls=[
+                            ToolCall(
+                                function=tc,
+                                type="function",
+                            )
+                            for tc in tool_calls
+                        ],
                     )
 
                 else:
@@ -1505,11 +1486,11 @@ class OpenAIServingChat(OpenAIServing):
 
                     # try to use content return from tool parser first,
                     # tool parser may do some modify for the content.
-                    if tool_call_info.content and len(tool_call_info.content) > 0:
-                        ret_content = tool_call_info.content
+                    if content and len(content) > 0:
+                        ret_content = content
                     message = ChatMessage(
                         role=role,
-                        reasoning_content=reasoning_content,
+                        reasoning=reasoning,
                         content=ret_content,
                     )
 
@@ -1520,21 +1501,25 @@ class OpenAIServingChat(OpenAIServing):
                     " if tools should be extracted. Returning a standard chat "
                     "completion."
                 )
-                message = ChatMessage(
-                    role=role, reasoning_content=reasoning_content, content=content
-                )
+                message = ChatMessage(role=role, reasoning=reasoning, content=content)
+            # In OpenAI's API, when a tool is called, the finish_reason is:
+            # "tool_calls" for "auto" or "required" tool calls,
+            # and "stop" for named tool calls.
+            is_finish_reason_tool_calls = auto_tools_called or (
+                request.tool_choice
+                and request.tool_choice == "required"
+                and output.finish_reason == "stop"
+            )
 
             choice_data = ChatCompletionResponseChoice(
                 index=output.index,
                 message=message,
                 logprobs=logprobs,
-                finish_reason=(
-                    "tool_calls"
-                    if auto_tools_called
-                    else output.finish_reason
-                    if output.finish_reason
-                    else "stop"
-                ),
+                finish_reason="tool_calls"
+                if is_finish_reason_tool_calls
+                else output.finish_reason
+                if output.finish_reason
+                else "stop",
                 stop_reason=output.stop_reason,
                 token_ids=(
                     as_list(output.token_ids) if request.return_token_ids else None
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 14dbdd4cb4c7c..a114b77ebc16b 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -36,6 +36,7 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils.async_utils import merge_async_iterators
 from vllm.utils.collection_utils import as_list
+from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
 
 logger = init_logger(__name__)
 
@@ -59,6 +60,10 @@ class OpenAIServingCompletion(OpenAIServing):
             return_tokens_as_token_ids=return_tokens_as_token_ids,
             log_error_stack=log_error_stack,
         )
+
+        # set up logits processors
+        self.logits_processors = self.model_config.logits_processors
+
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.default_sampling_params = self.model_config.get_diff_sampling_param()
         self.enable_force_include_usage = enable_force_include_usage
@@ -181,6 +186,10 @@ class OpenAIServingCompletion(OpenAIServing):
                         self.model_config.logits_processor_pattern,
                         self.default_sampling_params,
                     )
+                    validate_logits_processors_parameters(
+                        self.logits_processors,
+                        sampling_params,
+                    )
 
                 request_id_item = f"{request_id}-{i}"
 
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 46e79edbde611..30b8499b08d5b 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -12,7 +12,7 @@ from typing import Any, ClassVar, Generic, TypeAlias, TypeVar
 
 import torch
 from fastapi import Request
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, TypeAdapter
 from starlette.datastructures import Headers
 from typing_extensions import TypeIs
 
@@ -21,6 +21,10 @@ if sys.version_info >= (3, 12):
 else:
     from typing_extensions import TypedDict
 
+from openai.types.responses import (
+    ToolChoiceFunction,
+)
+
 import vllm.envs as envs
 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
 from vllm.engine.protocol import EngineClient
@@ -36,6 +40,7 @@ from vllm.entrypoints.chat_utils import (
 from vllm.entrypoints.context import ConversationContext
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
+    ChatCompletionNamedToolChoiceParam,
     ChatCompletionRequest,
     ChatCompletionResponse,
     ClassificationRequest,
@@ -49,6 +54,8 @@ from vllm.entrypoints.openai.protocol import (
     EmbeddingResponse,
     ErrorInfo,
     ErrorResponse,
+    FunctionCall,
+    FunctionDefinition,
     IOProcessorRequest,
     PoolingResponse,
     RerankRequest,
@@ -1091,13 +1098,13 @@ class OpenAIServing:
         )
 
         if should_parse_tools:
-            if not isinstance(request, ChatCompletionRequest):
-                msg = "Tool usage is only supported for Chat Completions API"
+            if not isinstance(request, ChatCompletionRequest | ResponsesRequest):
+                msg = (
+                    "Tool usage is only supported for Chat Completions API "
+                    "or Responses API requests."
+                )
                 raise NotImplementedError(msg)
-
-            request = tool_parser(tokenizer).adjust_request(  # type: ignore
-                request=request
-            )
+            request = tool_parser(tokenizer).adjust_request(request=request)  # type: ignore
 
         if tokenizer is None:
             assert isinstance(request_prompt, str), (
@@ -1305,6 +1312,77 @@ class OpenAIServing:
         except ValueError:
             return None
 
+    @staticmethod
+    def _parse_tool_calls_from_content(
+        request: ResponsesRequest | ChatCompletionRequest,
+        tokenizer: AnyTokenizer,
+        enable_auto_tools: bool,
+        tool_parser_cls: Callable[[AnyTokenizer], ToolParser] | None,
+        content: str | None = None,
+    ) -> tuple[list[FunctionCall] | None, str | None]:
+        function_calls = list[FunctionCall]()
+        if request.tool_choice and isinstance(request.tool_choice, ToolChoiceFunction):
+            assert content is not None
+            # Forced Function Call
+            function_calls.append(
+                FunctionCall(name=request.tool_choice.name, arguments=content)
+            )
+            content = None  # Clear content since tool is called.
+        elif request.tool_choice and isinstance(
+            request.tool_choice, ChatCompletionNamedToolChoiceParam
+        ):
+            assert content is not None
+            # Forced Function Call
+            function_calls.append(
+                FunctionCall(name=request.tool_choice.function.name, arguments=content)
+            )
+            content = None  # Clear content since tool is called.
+        elif request.tool_choice == "required":
+            assert content is not None
+            tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content)
+            function_calls.extend(
+                [
+                    FunctionCall(
+                        name=tool_call.name,
+                        arguments=json.dumps(tool_call.parameters, ensure_ascii=False),
+                    )
+                    for tool_call in tool_calls
+                ]
+            )
+            content = None  # Clear content since tool is called.
+        elif (
+            tool_parser_cls
+            and enable_auto_tools
+            and (request.tool_choice == "auto" or request.tool_choice is None)
+        ):
+            # Automatic Tool Call Parsing
+            try:
+                tool_parser = tool_parser_cls(tokenizer)
+            except RuntimeError as e:
+                logger.exception("Error in tool parser creation.")
+                raise e
+            tool_call_info = tool_parser.extract_tool_calls(
+                content if content is not None else "",
+                request=request,  # type: ignore
+            )
+            if tool_call_info is not None and tool_call_info.tools_called:
+                # extract_tool_calls() returns a list of tool calls.
+                function_calls.extend(
+                    FunctionCall(
+                        name=tool_call.function.name,
+                        arguments=tool_call.function.arguments,
+                    )
+                    for tool_call in tool_call_info.tool_calls
+                )
+                content = tool_call_info.content
+                if content and content.strip() == "":
+                    content = None
+            else:
+                # No tool calls.
+                return None, content
+
+        return function_calls, content
+
     @staticmethod
     def _get_decoded_token(
         logprob: Logprob,
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 2ee8de5fba07a..9b79e50c32085 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -94,6 +94,7 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.responses_utils import construct_chat_message_with_tool_call
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
@@ -196,16 +197,12 @@ class OpenAIServingResponses(OpenAIServing):
             self.default_sampling_params["stop_token_ids"].extend(
                 get_stop_tokens_for_assistant_actions()
             )
-
+        self.enable_auto_tools = enable_auto_tools
         # set up tool use
-        self.enable_auto_tools: bool = enable_auto_tools
-        if self.enable_auto_tools:
-            logger.info(
-                '"auto" tool choice has been enabled please note that while'
-                " the parallel_tool_calls client option is preset for "
-                "compatibility reasons, it will be ignored."
-            )
-
+        self.tool_parser = self._get_tool_parser(
+            tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
+        )
+        self.exclude_tools_when_tool_choice_none = False
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
@@ -511,16 +508,20 @@ class OpenAIServingResponses(OpenAIServing):
         prev_response: ResponsesResponse | None,
         tokenizer: AnyTokenizer,
     ):
-        if len(request.tools) > 0:
-            raise NotImplementedError(
-                "Tool use is not supported in Responses API without Harmony"
-            )
+        if request.tools is None or (
+            request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
+        ):
+            tool_dicts = None
+        else:
+            tool_dicts = [tool.model_dump() for tool in request.tools]
         # Construct the input messages.
         messages = self._construct_input_messages(request, prev_response)
         _, request_prompts, engine_prompts = await self._preprocess_chat(
             request,
             tokenizer,
             messages,
+            tool_dicts=tool_dicts,
+            tool_parser=self.tool_parser,
             chat_template=self.chat_template,
             chat_template_content_format=self.chat_template_content_format,
         )
@@ -777,11 +778,11 @@ class OpenAIServingResponses(OpenAIServing):
                 logger.exception("Error in reasoning parser creation.")
                 raise e
 
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(
+            reasoning, content = reasoning_parser.extract_reasoning(
                 final_output.text, request=request
             )
         else:
-            reasoning_content = None
+            reasoning = None
             content = final_output.text
 
         # Log complete response if output logging is enabled
@@ -789,8 +790,8 @@ class OpenAIServingResponses(OpenAIServing):
             output_text = ""
             if content:
                 output_text = content
-            elif reasoning_content:
-                output_text = f"[reasoning: {reasoning_content}]"
+            elif reasoning:
+                output_text = f"[reasoning: {reasoning}]"
 
             if output_text:
                 self.request_logger.log_outputs(
@@ -802,20 +803,25 @@ class OpenAIServingResponses(OpenAIServing):
                     delta=False,
                 )
 
-        output = []
-        if reasoning_content:
+        reasoning_item = None
+        message_item = None
+        if reasoning:
             reasoning_item = ResponseReasoningItem(
                 id=f"rs_{random_uuid()}",
                 summary=[],
                 type="reasoning",
                 content=[
-                    ResponseReasoningTextContent(
-                        text=reasoning_content, type="reasoning_text"
-                    )
+                    ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
                 ],
                 status=None,  # NOTE: Only the last output item has status.
             )
-            output.append(reasoning_item)
+        tool_calls, content = self._parse_tool_calls_from_content(
+            request=request,
+            tokenizer=tokenizer,
+            content=content,
+            enable_auto_tools=self.enable_auto_tools,
+            tool_parser_cls=self.tool_parser,
+        )
         if content:
             output_text = ResponseOutputText(
                 text=content,
@@ -832,15 +838,33 @@ class OpenAIServingResponses(OpenAIServing):
                     else None
                 ),
             )
-            message = ResponseOutputMessage(
+            message_item = ResponseOutputMessage(
                 id=f"msg_{random_uuid()}",
                 content=[output_text],
                 role="assistant",
                 status="completed",
                 type="message",
             )
-            output.append(message)
-        return output
+        outputs = []
+
+        if reasoning_item:
+            outputs.append(reasoning_item)
+        if message_item:
+            outputs.append(message_item)
+        if tool_calls:
+            tool_call_items = [
+                ResponseFunctionToolCall(
+                    id=f"fc_{random_uuid()}",
+                    call_id=f"call_{random_uuid()}",
+                    type="function_call",
+                    status="completed",
+                    name=tool_call.name,
+                    arguments=tool_call.arguments,
+                )
+                for tool_call in tool_calls
+            ]
+            outputs.extend(tool_call_items)
+        return outputs
 
     def _make_response_output_items_with_harmony(
         self,
@@ -893,7 +917,8 @@ class OpenAIServingResponses(OpenAIServing):
         if isinstance(request.input, str):
             messages.append({"role": "user", "content": request.input})
         else:
-            messages.extend(request.input)  # type: ignore
+            for item in request.input:
+                messages.append(construct_chat_message_with_tool_call(item))
         return messages
 
     def _construct_harmony_system_input_message(
@@ -1000,11 +1025,6 @@ class OpenAIServingResponses(OpenAIServing):
                 # to add the tool call request to prev_outputs so that the
                 # parse_response_input can find the tool call request when
                 # parsing the tool call output.
-                if (
-                    isinstance(response_msg, dict)
-                    and response_msg.get("type") == "function_call"
-                ):
-                    response_msg = ResponseFunctionToolCall.model_validate(response_msg)
                 if isinstance(response_msg, ResponseFunctionToolCall):
                     prev_outputs.append(response_msg)
         return messages
@@ -1186,15 +1206,13 @@ class OpenAIServingResponses(OpenAIServing):
             if ctx.last_output.outputs:
                 output = ctx.last_output.outputs[0]
                 if reasoning_parser:
-                    delta_message = (
-                        reasoning_parser.extract_reasoning_content_streaming(
-                            previous_text=previous_text,
-                            current_text=previous_text + output.text,
-                            delta_text=output.text,
-                            previous_token_ids=previous_token_ids,
-                            current_token_ids=previous_token_ids + output.token_ids,
-                            delta_token_ids=output.token_ids,
-                        )
+                    delta_message = reasoning_parser.extract_reasoning_streaming(
+                        previous_text=previous_text,
+                        current_text=previous_text + output.text,
+                        delta_text=output.text,
+                        previous_token_ids=previous_token_ids,
+                        current_token_ids=previous_token_ids + output.token_ids,
+                        delta_token_ids=output.token_ids,
                     )
                 else:
                     delta_message = DeltaMessage(
@@ -1206,7 +1224,7 @@ class OpenAIServingResponses(OpenAIServing):
                     continue
                 if not first_delta_sent:
                     current_item_id = str(uuid.uuid4())
-                    if delta_message.reasoning_content:
+                    if delta_message.reasoning:
                         yield _increment_sequence_number_and_return(
                             ResponseOutputItemAddedEvent(
                                 type="response.output_item.added",
@@ -1258,15 +1276,15 @@ class OpenAIServingResponses(OpenAIServing):
                 # same as content or reasoning content
                 if (
                     previous_delta_messages
-                    and previous_delta_messages[-1].reasoning_content is not None
+                    and previous_delta_messages[-1].reasoning is not None
                     and delta_message.content is not None
                 ):
                     # from reasoning to normal content, send done
                     # event for reasoning
                     reason_content = "".join(
-                        pm.reasoning_content
+                        pm.reasoning
                         for pm in previous_delta_messages
-                        if pm.reasoning_content is not None
+                        if pm.reasoning is not None
                     )
                     yield _increment_sequence_number_and_return(
                         ResponseReasoningTextDoneEvent(
@@ -1334,7 +1352,7 @@ class OpenAIServingResponses(OpenAIServing):
                     # reset previous delta messages
                     previous_delta_messages = []
 
-                if delta_message.reasoning_content is not None:
+                if delta_message.reasoning is not None:
                     yield _increment_sequence_number_and_return(
                         ResponseReasoningTextDeltaEvent(
                             type="response.reasoning_text.delta",
@@ -1342,7 +1360,7 @@ class OpenAIServingResponses(OpenAIServing):
                             content_index=current_content_index,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                            delta=delta_message.reasoning_content,
+                            delta=delta_message.reasoning,
                         )
                     )
                 elif delta_message.content is not None:
@@ -1370,11 +1388,11 @@ class OpenAIServingResponses(OpenAIServing):
 
                 previous_delta_messages.append(delta_message)
         if previous_delta_messages:
-            if previous_delta_messages[-1].reasoning_content is not None:
+            if previous_delta_messages[-1].reasoning is not None:
                 reason_content = "".join(
-                    pm.reasoning_content
+                    pm.reasoning
                     for pm in previous_delta_messages
-                    if pm.reasoning_content is not None
+                    if pm.reasoning is not None
                 )
                 yield _increment_sequence_number_and_return(
                     ResponseReasoningTextDoneEvent(
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index 46139642c50c1..b9b9b1ab30ad8 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -170,11 +170,6 @@ class OpenAISpeechToText(OpenAIServing):
         try:
             lora_request = self._maybe_get_adapters(request)
 
-            if lora_request:
-                return self.create_error_response(
-                    f"Currently do not support LoRA for {self.task_type.title()}."
-                )
-
             prompts, duration_s = await self._preprocess_speech_to_text(
                 request=request,
                 audio_data=audio_data,
@@ -199,7 +194,7 @@ class OpenAISpeechToText(OpenAIServing):
                 # It will not display special tokens like <|startoftranscript|>
                 request.prompt,
                 params=sampling_params,
-                lora_request=None,
+                lora_request=lora_request,
             )
 
             list_result_generator = [
@@ -207,6 +202,7 @@ class OpenAISpeechToText(OpenAIServing):
                     prompt,
                     sampling_params,
                     request_id,
+                    lora_request=lora_request,
                 )
                 for prompt in prompts
             ]
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index 4541ca50822f7..89e439dd53f5f 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -1,61 +1,142 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from .abstract_tool_parser import ToolParser, ToolParserManager
-from .deepseekv3_tool_parser import DeepSeekV3ToolParser
-from .deepseekv31_tool_parser import DeepSeekV31ToolParser
-from .ernie45_tool_parser import Ernie45ToolParser
-from .glm4_moe_tool_parser import Glm4MoeModelToolParser
-from .granite_20b_fc_tool_parser import Granite20bFCToolParser
-from .granite_tool_parser import GraniteToolParser
-from .hermes_tool_parser import Hermes2ProToolParser
-from .hunyuan_a13b_tool_parser import HunyuanA13BToolParser
-from .internlm2_tool_parser import Internlm2ToolParser
-from .jamba_tool_parser import JambaToolParser
-from .kimi_k2_tool_parser import KimiK2ToolParser
-from .llama4_pythonic_tool_parser import Llama4PythonicToolParser
-from .llama_tool_parser import Llama3JsonToolParser
-from .longcat_tool_parser import LongcatFlashToolParser
-from .minimax_m2_tool_parser import MinimaxM2ToolParser
-from .minimax_tool_parser import MinimaxToolParser
-from .mistral_tool_parser import MistralToolParser
-from .olmo3_tool_parser import Olmo3PythonicToolParser
-from .openai_tool_parser import OpenAIToolParser
-from .phi4mini_tool_parser import Phi4MiniJsonToolParser
-from .pythonic_tool_parser import PythonicToolParser
-from .qwen3coder_tool_parser import Qwen3CoderToolParser
-from .qwen3xml_tool_parser import Qwen3XMLToolParser
-from .seed_oss_tool_parser import SeedOssToolParser
-from .step3_tool_parser import Step3ToolParser
-from .xlam_tool_parser import xLAMToolParser
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser,
+    ToolParserManager,
+)
 
-__all__ = [
-    "ToolParser",
-    "ToolParserManager",
-    "Granite20bFCToolParser",
-    "GraniteToolParser",
-    "Hermes2ProToolParser",
-    "MistralToolParser",
-    "Internlm2ToolParser",
-    "Llama3JsonToolParser",
-    "JambaToolParser",
-    "Llama4PythonicToolParser",
-    "LongcatFlashToolParser",
-    "PythonicToolParser",
-    "Phi4MiniJsonToolParser",
-    "DeepSeekV3ToolParser",
-    "DeepSeekV31ToolParser",
-    "Ernie45ToolParser",
-    "xLAMToolParser",
-    "Olmo3PythonicToolParser",
-    "MinimaxToolParser",
-    "KimiK2ToolParser",
-    "HunyuanA13BToolParser",
-    "Glm4MoeModelToolParser",
-    "Qwen3CoderToolParser",
-    "Qwen3XMLToolParser",
-    "SeedOssToolParser",
-    "Step3ToolParser",
-    "OpenAIToolParser",
-    "MinimaxM2ToolParser",
-]
+__all__ = ["ToolParser", "ToolParserManager"]
+
+
+"""
+Register a lazy module mapping.
+
+Example:
+    ToolParserManager.register_lazy_module(
+        name="kimi_k2",
+        module_path="vllm.entrypoints.openai.tool_parsers.kimi_k2_parser",
+        class_name="KimiK2ToolParser",
+    )
+"""
+
+
+_TOOL_PARSERS_TO_REGISTER = {
+    "deepseek_v3": (  # name
+        "deepseekv3_tool_parser",  # filename
+        "DeepSeekV3ToolParser",  # class_name
+    ),
+    "deepseek_v31": (
+        "deepseekv31_tool_parser",
+        "DeepSeekV31ToolParser",
+    ),
+    "ernie45": (
+        "ernie45_tool_parser",
+        "Ernie45ToolParser",
+    ),
+    "glm45": (
+        "glm4_moe_tool_parser",
+        "Glm4MoeModelToolParser",
+    ),
+    "granite-20b-fc": (
+        "granite_20b_fc_tool_parser",
+        "Granite20bFCToolParser",
+    ),
+    "granite": (
+        "granite_tool_parser",
+        "GraniteToolParser",
+    ),
+    "hermes": (
+        "hermes_tool_parser",
+        "Hermes2ProToolParser",
+    ),
+    "hunyuan_a13b": (
+        "hunyuan_a13b_tool_parser",
+        "HunyuanA13BToolParser",
+    ),
+    "internlm": (
+        "internlm2_tool_parser",
+        "Internlm2ToolParser",
+    ),
+    "jamba": (
+        "jamba_tool_parser",
+        "JambaToolParser",
+    ),
+    "kimi_k2": (
+        "kimi_k2_tool_parser",
+        "KimiK2ToolParser",
+    ),
+    "llama3_json": (
+        "llama_tool_parser",
+        "Llama3JsonToolParser",
+    ),
+    "llama4_json": (
+        "llama_tool_parser",
+        "Llama3JsonToolParser",
+    ),
+    "llama4_pythonic": (
+        "llama4_pythonic_tool_parser",
+        "Llama4PythonicToolParser",
+    ),
+    "longcat": (
+        "longcat_tool_parser",
+        "LongcatFlashToolParser",
+    ),
+    "minimax_m2": (
+        "minimax_m2_tool_parser",
+        "MinimaxM2ToolParser",
+    ),
+    "minimax": (
+        "minimax_tool_parser",
+        "MinimaxToolParser",
+    ),
+    "mistral": (
+        "mistral_tool_parser",
+        "MistralToolParser",
+    ),
+    "olmo3": (
+        "olmo3_tool_parser",
+        "Olmo3PythonicToolParser",
+    ),
+    "openai": (
+        "openai_tool_parser",
+        "OpenAIToolParser",
+    ),
+    "phi4_mini_json": (
+        "phi4mini_tool_parser",
+        "Phi4MiniJsonToolParser",
+    ),
+    "pythonic": (
+        "pythonic_tool_parser",
+        "PythonicToolParser",
+    ),
+    "qwen3_coder": (
+        "qwen3coder_tool_parser",
+        "Qwen3CoderToolParser",
+    ),
+    "qwen3_xml": (
+        "qwen3xml_tool_parser",
+        "Qwen3XMLToolParser",
+    ),
+    "seed_oss": (
+        "seed_oss_tool_parser",
+        "SeedOssToolParser",
+    ),
+    "step3": (
+        "step3_tool_parser",
+        "Step3ToolParser",
+    ),
+    "xlam": (
+        "xlam_tool_parser",
+        "xLAMToolParser",
+    ),
+}
+
+
+def register_lazy_tool_parsers():
+    for name, (file_name, class_name) in _TOOL_PARSERS_TO_REGISTER.items():
+        module_path = f"vllm.entrypoints.openai.tool_parsers.{file_name}"
+        ToolParserManager.register_lazy_module(name, module_path, class_name)
+
+
+register_lazy_tool_parsers()
diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index 212326fdafb1e..e99e405f5de65 100644
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -1,14 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib
 import os
 from collections.abc import Callable, Sequence
 from functools import cached_property
 
+from openai.types.responses.response_format_text_json_schema_config import (
+    ResponseFormatTextJSONSchemaConfig,
+)
+
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     DeltaMessage,
     ExtractedToolCallInformation,
+    ResponsesRequest,
+    ResponseTextConfig,
 )
 from vllm.entrypoints.openai.tool_parsers.utils import get_json_schema_from_tools
 from vllm.logger import init_logger
@@ -55,11 +62,21 @@ class ToolParser:
         )
         # Set structured output params for tool calling
         if json_schema_from_tool is not None:
-            if request.structured_outputs is None:
+            if isinstance(request, ChatCompletionRequest):
                 request.structured_outputs = StructuredOutputsParams()
-            # tool_choice: "Forced Function" or "required" will override
-            # structured output json settings to make tool calling work correctly
-            request.structured_outputs.json = json_schema_from_tool
+                # tool_choice: "Forced Function" or "required" will override
+                # structured output json settings to make tool calling work correctly
+                request.structured_outputs.json = json_schema_from_tool
+            if isinstance(request, ResponsesRequest):
+                request.text = ResponseTextConfig()
+                request.text.format = ResponseFormatTextJSONSchemaConfig(
+                    name="tool_calling_response",
+                    schema=json_schema_from_tool,
+                    type="json_schema",
+                    description="Response format for tool calling",
+                    strict=True,
+                )
+
         return request
 
     def extract_tool_calls(
@@ -99,89 +116,158 @@ class ToolParser:
 
 
 class ToolParserManager:
-    tool_parsers: dict[str, type] = {}
+    """
+    Central registry for ToolParser implementations.
+
+    Supports two modes:
+      - Eager (immediate) registration via `register_module`
+      - Lazy registration via `register_lazy_module`
+    """
+
+    tool_parsers: dict[str, type[ToolParser]] = {}
+    lazy_parsers: dict[str, tuple[str, str]] = {}  # name -> (module_path, class_name)
 
     @classmethod
-    def get_tool_parser(cls, name) -> type:
+    def get_tool_parser(cls, name: str) -> type[ToolParser]:
         """
-        Get tool parser by name which is registered by `register_module`.
+        Retrieve a registered or lazily registered ToolParser class.
 
-        Raise a KeyError exception if the name is not registered.
+        If the parser is lazily registered,
+        it will be imported and cached on first access.
+        Raises KeyError if not found.
         """
         if name in cls.tool_parsers:
             return cls.tool_parsers[name]
 
-        raise KeyError(f"tool helper: '{name}' not found in tool_parsers")
+        if name in cls.lazy_parsers:
+            return cls._load_lazy_parser(name)
+
+        raise KeyError(f"Tool parser '{name}' not found.")
+
+    @classmethod
+    def _load_lazy_parser(cls, name: str) -> type[ToolParser]:
+        """Import and register a lazily loaded parser."""
+        module_path, class_name = cls.lazy_parsers[name]
+        try:
+            mod = importlib.import_module(module_path)
+            parser_cls = getattr(mod, class_name)
+            if not issubclass(parser_cls, ToolParser):
+                raise TypeError(
+                    f"{class_name} in {module_path} is not a ToolParser subclass."
+                )
+            cls.tool_parsers[name] = parser_cls  # cache
+            return parser_cls
+        except Exception as e:
+            logger.exception(
+                "Failed to import lazy tool parser '%s' from %s: %s",
+                name,
+                module_path,
+                e,
+            )
+            raise
 
     @classmethod
     def _register_module(
         cls,
-        module: type,
+        module: type[ToolParser],
         module_name: str | list[str] | None = None,
         force: bool = True,
     ) -> None:
+        """Register a ToolParser class immediately."""
         if not issubclass(module, ToolParser):
             raise TypeError(
                 f"module must be subclass of ToolParser, but got {type(module)}"
             )
+
         if module_name is None:
             module_name = module.__name__
+
         if isinstance(module_name, str):
-            module_name = [module_name]
-        for name in module_name:
+            module_names = [module_name]
+        elif is_list_of(module_name, str):
+            module_names = module_name
+        else:
+            raise TypeError("module_name must be str, list[str], or None.")
+
+        for name in module_names:
             if not force and name in cls.tool_parsers:
-                existed_module = cls.tool_parsers[name]
-                raise KeyError(
-                    f"{name} is already registered at {existed_module.__module__}"
-                )
+                existed = cls.tool_parsers[name]
+                raise KeyError(f"{name} is already registered at {existed.__module__}")
             cls.tool_parsers[name] = module
 
+    @classmethod
+    def register_lazy_module(cls, name: str, module_path: str, class_name: str) -> None:
+        """
+        Register a lazy module mapping.
+
+        Example:
+            ToolParserManager.register_lazy_module(
+                name="kimi_k2",
+                module_path="vllm.entrypoints.openai.tool_parsers.kimi_k2_parser",
+                class_name="KimiK2ToolParser",
+            )
+        """
+        cls.lazy_parsers[name] = (module_path, class_name)
+
     @classmethod
     def register_module(
         cls,
         name: str | list[str] | None = None,
         force: bool = True,
-        module: type | None = None,
-    ) -> type | Callable:
+        module: type[ToolParser] | None = None,
+    ) -> type[ToolParser] | Callable[[type[ToolParser]], type[ToolParser]]:
         """
-        Register module with the given name or name list. it can be used as a
-        decoder(with module as None) or normal function(with module as not
-        None).
+        Register module immediately or lazily (as a decorator).
+
+        Usage:
+            @ToolParserManager.register_module("kimi_k2")
+            class KimiK2ToolParser(ToolParser):
+                ...
+
+        Or:
+            ToolParserManager.register_module(module=SomeToolParser)
         """
         if not isinstance(force, bool):
             raise TypeError(f"force must be a boolean, but got {type(force)}")
 
-        # raise the error ahead of time
-        if not (name is None or isinstance(name, str) or is_list_of(name, str)):
-            raise TypeError(
-                "name must be None, an instance of str, or a sequence of str, "
-                f"but got {type(name)}"
-            )
-
-        # use it as a normal method: x.register_module(module=SomeClass)
+        # Immediate registration
         if module is not None:
             cls._register_module(module=module, module_name=name, force=force)
             return module
 
-        # use it as a decorator: @x.register_module()
-        def _register(module):
-            cls._register_module(module=module, module_name=name, force=force)
-            return module
+        # Decorator usage
+        def _decorator(obj: type[ToolParser]) -> type[ToolParser]:
+            module_path = obj.__module__
+            class_name = obj.__name__
 
-        return _register
+            if isinstance(name, str):
+                names = [name]
+            elif name is not None and is_list_of(name, str):
+                names = name
+            else:
+                names = [class_name]
+
+            for n in names:
+                # Lazy mapping only: do not import now
+                cls.lazy_parsers[n] = (module_path, class_name)
+
+            return obj
+
+        return _decorator
+
+    @classmethod
+    def list_registered(cls) -> list[str]:
+        """Return names of all eagerly and lazily registered tool parsers."""
+        return sorted(set(cls.tool_parsers.keys()) | set(cls.lazy_parsers.keys()))
 
     @classmethod
     def import_tool_parser(cls, plugin_path: str) -> None:
-        """
-        Import a user-defined tool parser by the path of the tool parser define
-        file.
-        """
-        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
+        """Import a user-defined parser file from arbitrary path."""
 
+        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
         try:
             import_from_path(module_name, plugin_path)
         except Exception:
             logger.exception(
                 "Failed to load module '%s' from %s.", module_name, plugin_path
             )
-            return
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
index 14fd5cf0941c6..cbeb879969ece 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
@@ -17,7 +17,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -25,7 +24,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("deepseek_v31")
 class DeepSeekV31ToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
index b256560fb4beb..bf7f6fa61ab90 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
@@ -17,7 +17,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -25,7 +24,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("deepseek_v3")
 class DeepSeekV3ToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py
index e4696334eb135..82370323cb00d 100644
--- a/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py
@@ -17,7 +17,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -25,7 +24,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("ernie45")
 class Ernie45ToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         """
diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
index 5081b38240ce6..120e63b929b16 100644
--- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
@@ -20,7 +20,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -28,7 +27,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("glm45")
 class Glm4MoeModelToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
index c5246685f4071..ae9217426fb51 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -21,7 +21,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.entrypoints.openai.tool_parsers.utils import (
     consume_space,
@@ -35,7 +34,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("granite-20b-fc")
 class Granite20bFCToolParser(ToolParser):
     """
     Tool call parser for the granite-20b-functioncalling model intended
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
index cc1f500342353..d29c427694dc9 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.entrypoints.openai.tool_parsers.utils import (
     consume_space,
@@ -33,7 +32,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("granite")
 class GraniteToolParser(ToolParser):
     """
     Tool call parser for the granite 3.0 models. Intended
diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
index 6332de42f424e..4336a5438109f 100644
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -20,7 +20,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
@@ -28,7 +27,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("hermes")
 class Hermes2ProToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
index b32e6e39b3e5c..920675c8389b8 100644
--- a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
@@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.entrypoints.openai.tool_parsers.utils import consume_space
 from vllm.logger import init_logger
@@ -29,7 +28,6 @@ from vllm.utils import random_uuid
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("hunyuan_a13b")
 class HunyuanA13BToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
index c87bab4353b5b..1dd327f645b3a 100644
--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
 from vllm.logger import init_logger
@@ -28,7 +27,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module(["internlm"])
 class Internlm2ToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
index 21ee2b762cd0a..6f53ddea4f0ef 100644
--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
@@ -18,7 +18,7 @@ from vllm.entrypoints.openai.protocol import (
     FunctionCall,
     ToolCall,
 )
-from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.entrypoints.openai.tool_parsers import ToolParser
 from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -27,7 +27,6 @@ from vllm.transformers_utils.tokenizers import MistralTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("jamba")
 class JambaToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
index 3fff3b371dbe3..0453db58361a9 100644
--- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
@@ -17,7 +17,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -25,7 +24,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module(["kimi_k2"])
 class KimiK2ToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
index dd622b69525de..1d6de9244066e 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
@@ -20,7 +20,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 
@@ -31,7 +30,6 @@ class _UnexpectedAstError(Exception):
     pass
 
 
-@ToolParserManager.register_module("llama4_pythonic")
 class Llama4PythonicToolParser(ToolParser):
     """
     Toolcall parser for Llama4 that produce tool calls in a pythonic style
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 8c7b3cefb200e..02fc9b8a4d34e 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -21,7 +21,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.entrypoints.openai.tool_parsers.utils import (
     find_common_prefix,
@@ -33,8 +32,6 @@ from vllm.logger import init_logger
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("llama3_json")
-@ToolParserManager.register_module("llama4_json")
 class Llama3JsonToolParser(ToolParser):
     """
     Tool call parser for Llama 3.x and 4 models intended for use with the
diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py
index 1dc1a0290c8d9..c6c8ae8ae95f1 100644
--- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py
@@ -3,12 +3,10 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParserManager
 from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 
-@ToolParserManager.register_module("longcat")
 class LongcatFlashToolParser(Hermes2ProToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py
index d083ece892d50..5c2258ba62b29 100644
--- a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py
@@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -27,7 +26,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("minimax_m2")
 class MinimaxM2ToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
@@ -95,7 +93,7 @@ class MinimaxM2ToolParser(ToolParser):
                 "tokens in the tokenizer!"
             )
 
-        logger.info(
+        logger.debug(
             "vLLM Successfully import tool parser %s !", self.__class__.__name__
         )
 
diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
index 4b12bf68b3670..982518a52e3da 100644
--- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
@@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
 from vllm.logger import init_logger
@@ -28,7 +27,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("minimax")
 class MinimaxToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index dbdf0085367bc..85671271522d3 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -22,7 +22,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
 from vllm.logger import init_logger
@@ -53,7 +52,6 @@ def _is_fn_name_regex_support(model_tokenizer: AnyTokenizer) -> bool:
     )
 
 
-@ToolParserManager.register_module("mistral")
 class MistralToolParser(ToolParser):
     """
     Tool call parser for Mistral 7B Instruct v0.3, intended for use with
diff --git a/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py
index ed5633aac02d4..baff33bd7e8ac 100644
--- a/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py
@@ -20,7 +20,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 
@@ -31,7 +30,6 @@ class _UnexpectedAstError(Exception):
     pass
 
 
-@ToolParserManager.register_module("olmo3")
 class Olmo3PythonicToolParser(ToolParser):
     """
     Tool call parser for Olmo 3 models that produce tool calls as
diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
index f44876943ac28..d1b36a297e0b1 100644
--- a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
@@ -14,7 +14,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 
@@ -26,7 +25,6 @@ else:
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("openai")
 class OpenAIToolParser(ToolParser):
     def __init__(self, tokenizer: "AnyTokenizer"):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
index a8387ba1494df..acb25ea2768e1 100644
--- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
@@ -18,14 +18,12 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("phi4_mini_json")
 class Phi4MiniJsonToolParser(ToolParser):
     """
     Tool call parser for phi-4-mini models intended for use with the
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
index 4945e7b5ab20a..abeb923b93227 100644
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -21,7 +21,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 
@@ -32,7 +31,6 @@ class _UnexpectedAstError(Exception):
     pass
 
 
-@ToolParserManager.register_module("pythonic")
 class PythonicToolParser(ToolParser):
     """
     Tool call parser for models that produce tool calls in a pythonic style,
diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
index ad56972e6387e..26261c0065ead 100644
--- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
@@ -20,7 +20,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -28,7 +27,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("qwen3_coder")
 class Qwen3CoderToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
index 9964d1ac25c40..432c419db189a 100644
--- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
@@ -21,7 +21,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -280,7 +279,7 @@ class StreamingXMLToolCallParser:
                     final_delta = DeltaMessage(
                         role=None,
                         content=None,
-                        reasoning_content=None,
+                        reasoning=None,
                         tool_calls=[
                             DeltaToolCall(
                                 index=self.tool_call_index - 1,
@@ -1165,7 +1164,6 @@ class StreamingXMLToolCallParser:
         self.deferred_param_raw_value = ""
 
 
-@ToolParserManager.register_module("qwen3_xml")
 class Qwen3XMLToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
index f50a2df53bc04..8aed7f0e9fc96 100644
--- a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
@@ -23,7 +23,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -31,7 +30,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("seed_oss")
 class SeedOssToolParser(ToolParser):
     TOOL_CALL_START = "<seed:tool_call>"
     TOOL_CALL_END = "</seed:tool_call>"
diff --git a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
index d0255ec085391..adcb9f4765473 100644
--- a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
@@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -28,7 +27,6 @@ from vllm.utils import random_uuid
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module(["step3"])
 class Step3ToolParser(ToolParser):
     """
     Tool parser for a model that uses a specific XML-like format for tool calls.
diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
index c1f0d29cc0873..9d308af4de601 100644
--- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
@@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser,
-    ToolParserManager,
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -28,7 +27,6 @@ from vllm.utils import random_uuid
 logger = init_logger(__name__)
 
 
-@ToolParserManager.register_module("xlam")
 class xLAMToolParser(ToolParser):
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)
diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py
new file mode 100644
index 0000000000000..6eb7c0b70a670
--- /dev/null
+++ b/vllm/entrypoints/responses_utils.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from openai.types.chat import (
+    ChatCompletionAssistantMessageParam,
+    ChatCompletionMessageToolCallParam,
+    ChatCompletionToolMessageParam,
+)
+from openai.types.chat.chat_completion_message_tool_call_param import (
+    Function as FunctionCallTool,
+)
+from openai.types.responses import ResponseFunctionToolCall
+
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionMessageParam,
+    ResponseInputOutputItem,
+)
+
+
+def construct_chat_message_with_tool_call(
+    item: ResponseInputOutputItem,
+) -> ChatCompletionMessageParam:
+    if isinstance(item, ResponseFunctionToolCall):
+        # Append the function call as a tool call.
+        return ChatCompletionAssistantMessageParam(
+            role="assistant",
+            tool_calls=[
+                ChatCompletionMessageToolCallParam(
+                    id=item.call_id,
+                    function=FunctionCallTool(
+                        name=item.name,
+                        arguments=item.arguments,
+                    ),
+                    type="function",
+                )
+            ],
+        )
+    elif item.get("type") == "function_call_output":
+        # Append the function call output as a tool message.
+        return ChatCompletionToolMessageParam(
+            role="tool",
+            content=item.get("output"),
+            tool_call_id=item.get("call_id"),
+        )
+    return item  # type: ignore
diff --git a/vllm/entrypoints/sagemaker/__init__.py b/vllm/entrypoints/sagemaker/__init__.py
new file mode 100644
index 0000000000000..c1767137e4ea1
--- /dev/null
+++ b/vllm/entrypoints/sagemaker/__init__.py
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""SageMaker-specific integration for vLLM."""
diff --git a/vllm/entrypoints/sagemaker/routes.py b/vllm/entrypoints/sagemaker/routes.py
new file mode 100644
index 0000000000000..498b7294f0d8c
--- /dev/null
+++ b/vllm/entrypoints/sagemaker/routes.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from http import HTTPStatus
+
+import model_hosting_container_standards.sagemaker as sagemaker_standards
+import pydantic
+from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi.responses import JSONResponse, Response
+
+from vllm.entrypoints.openai.api_server import (
+    INVOCATION_VALIDATORS,
+    base,
+    health,
+    validate_json_request,
+)
+from vllm.entrypoints.openai.protocol import ErrorResponse
+
+
+def register_sagemaker_routes(router: APIRouter):
+    @router.post("/ping", response_class=Response)
+    @router.get("/ping", response_class=Response)
+    @sagemaker_standards.register_ping_handler
+    async def ping(raw_request: Request) -> Response:
+        """Ping check. Endpoint required for SageMaker"""
+        return await health(raw_request)
+
+    @router.post(
+        "/invocations",
+        dependencies=[Depends(validate_json_request)],
+        responses={
+            HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+            HTTPStatus.UNSUPPORTED_MEDIA_TYPE.value: {"model": ErrorResponse},
+            HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+        },
+    )
+    @sagemaker_standards.register_invocation_handler
+    @sagemaker_standards.stateful_session_manager()
+    @sagemaker_standards.inject_adapter_id(adapter_path="model")
+    async def invocations(raw_request: Request):
+        """For SageMaker, routes requests based on the request type."""
+        try:
+            body = await raw_request.json()
+        except json.JSONDecodeError as e:
+            raise HTTPException(
+                status_code=HTTPStatus.BAD_REQUEST.value,
+                detail=f"JSON decode error: {e}",
+            ) from e
+
+        valid_endpoints = [
+            (validator, endpoint)
+            for validator, (get_handler, endpoint) in INVOCATION_VALIDATORS
+            if get_handler(raw_request) is not None
+        ]
+
+        for request_validator, endpoint in valid_endpoints:
+            try:
+                request = request_validator.validate_python(body)
+            except pydantic.ValidationError:
+                continue
+
+            return await endpoint(request, raw_request)
+
+        type_names = [
+            t.__name__ if isinstance(t := validator._type, type) else str(t)
+            for validator, _ in valid_endpoints
+        ]
+        msg = f"Cannot find suitable handler for request. Expected one of: {type_names}"
+        res = base(raw_request).create_error_response(message=msg)
+        return JSONResponse(content=res.model_dump(), status_code=res.error.code)
+
+    return router
diff --git a/vllm/env_override.py b/vllm/env_override.py
index ae3e4e751bd9f..14dae2850c354 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -272,7 +272,6 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool:
     from torch._inductor.scheduler import (
         BaseSchedulerNode,
         FusedSchedulerNode,
-        _custom_should_partition_fns,
     )
     from torch._inductor.utils import (
         _unstable_customized_partition_wrapper,
@@ -283,9 +282,21 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool:
     # Allow users to manually specify if a node should be partitioned
     # Can only do this for FallbackKernels
     ir_node = node.node
-    if isinstance(ir_node, ir.FallbackKernel):
-        operator = ir_node.op_overload
-        if operator is not None and operator in _custom_should_partition_fns:
+    if isinstance(ir_node, torch._inductor.ir.FallbackKernel) and (
+        op := ir_node.op_overload
+    ):
+        op_overload_packet_name = op.name()
+        op_overload_name = (
+            f"{op_overload_packet_name}.{op._overloadname}"
+            if isinstance(op, torch._ops.OpOverload)
+            else op_overload_packet_name
+        )
+        if (
+            op_overload_packet_name
+            in torch._inductor.config.custom_should_partition_ops
+            or op_overload_name in torch._inductor.config.custom_should_partition_ops
+        ):
+            assert isinstance(op, torch._ops.OpOverload)
             return True
 
     # When not using cudagraphs, keep all kernels in the `call` function
@@ -355,6 +366,13 @@ def _update_scheduler_patched(self) -> None:
 if is_torch_equal("2.9.0"):
     from torch._inductor.codegen.wrapper import PythonWrapperCodegen
     from torch._inductor.graph import GraphLowering
+    from torch.utils._config_module import _Config, _ConfigEntry
+
+    # `custom_should_partition_ops` is a new config after 2.9.0. So this would
+    # not overwrite any user configs.
+    torch._inductor.config._config["custom_should_partition_ops"] = _ConfigEntry(
+        _Config(default=[])
+    )
 
     PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched
     GraphLowering._update_scheduler = _update_scheduler_patched
diff --git a/vllm/envs.py b/vllm/envs.py
index 21237c70a45e4..52178e5f52500 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -70,6 +70,7 @@ if TYPE_CHECKING:
     VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
     VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
     VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
+    VLLM_MEDIA_CONNECTOR: str = "http"
     VLLM_MM_INPUT_CACHE_GIB: int = 4
     VLLM_TARGET_DEVICE: str = "cuda"
     VLLM_MAIN_CUDA_VERSION: str = "12.8"
@@ -86,6 +87,7 @@ if TYPE_CHECKING:
     VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
     VLLM_PLUGINS: list[str] | None = None
     VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
+    VLLM_TORCH_CUDA_PROFILE: bool = False
     VLLM_TORCH_PROFILER_DIR: str | None = None
     VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
     VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
@@ -107,10 +109,11 @@ if TYPE_CHECKING:
     VLLM_ROCM_USE_AITER_MLA: bool = True
     VLLM_ROCM_USE_AITER_MHA: bool = True
     VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: bool = False
-    VLLM_ROCM_USE_TRITON_ROPE: bool = False
+    VLLM_ROCM_USE_AITER_TRITON_ROPE: bool = False
     VLLM_ROCM_USE_AITER_FP8BMM: bool = True
     VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: bool = False
     VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS: bool = True
+    VLLM_ROCM_USE_AITER_TRITON_GEMM: bool = True
     VLLM_ROCM_USE_SKINNY_GEMM: bool = True
     VLLM_ROCM_FP8_PADDING: bool = True
     VLLM_ROCM_MOE_PADDING: bool = True
@@ -144,6 +147,7 @@ if TYPE_CHECKING:
     VLLM_TPU_MOST_MODEL_LEN: int | None = None
     VLLM_TPU_USING_PATHWAYS: bool = False
     VLLM_USE_DEEP_GEMM: bool = True
+    VLLM_MOE_USE_DEEP_GEMM: bool = True
     VLLM_USE_DEEP_GEMM_E8M0: bool = True
     VLLM_DEEP_GEMM_WARMUP: Literal[
         "skip",
@@ -154,7 +158,7 @@ if TYPE_CHECKING:
     VLLM_USE_FLASHINFER_MOE_FP16: bool = False
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
-    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "throughput"
+    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "latency"
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
@@ -218,6 +222,8 @@ if TYPE_CHECKING:
     VLLM_USE_FBGEMM: bool = False
     VLLM_GC_DEBUG: str = ""
     VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
+    VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
+    VLLM_FLAT_LOGPROBS: bool = False
 
 
 def get_default_cache_root():
@@ -737,6 +743,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_VIDEO_LOADER_BACKEND": lambda: os.getenv(
         "VLLM_VIDEO_LOADER_BACKEND", "opencv"
     ),
+    # Media connector implementation.
+    # - "http": Default connector that supports fetching media via HTTP.
+    #
+    # Custom implementations can be registered
+    # via `@MEDIA_CONNECTOR_REGISTRY.register("my_custom_media_connector")` and
+    # imported at runtime.
+    # If a non-existing backend is used, an AssertionError will be thrown.
+    "VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"),
     # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache
     # Default is 4 GiB per API process + 4 GiB per engine core process
     "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")),
@@ -766,7 +780,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If set, the OpenAI API server will stay alive even after the underlying
     # AsyncLLMEngine errors and stops serving requests
     "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
-        os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)
+        int(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", "0"))
     ),
     # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
     # the user to specify a max sequence length greater than
@@ -804,6 +818,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
         "VLLM_LORA_RESOLVER_CACHE_DIR", None
     ),
+    # Enables torch CUDA profiling if set.
+    # On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered.
+    "VLLM_TORCH_CUDA_PROFILE": lambda: bool(
+        os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0"
+    ),
     # Enables torch profiler if set.
     # Both AsyncLLM's CPU traces as well as workers'
     # traces (CPU & GPU) will be saved under this directory.
@@ -908,8 +927,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     ),
     # Whether to use aiter rope.
     # By default is disabled.
-    "VLLM_ROCM_USE_TRITON_ROPE": lambda: (
-        os.getenv("VLLM_ROCM_USE_TRITON_ROPE", "False").lower() in ("true", "1")
+    "VLLM_ROCM_USE_AITER_TRITON_ROPE": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_TRITON_ROPE", "False").lower() in ("true", "1")
     ),
     # Whether to use aiter triton fp8 bmm kernel
     # By default is enabled.
@@ -927,6 +946,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
         os.getenv("VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS", "True").lower()
         in ("true", "1")
     ),
+    # Whether to use aiter triton kernels for gemm ops.
+    # By default is enabled.
+    "VLLM_ROCM_USE_AITER_TRITON_GEMM": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_TRITON_GEMM", "True").lower() in ("true", "1")
+    ),
     # use rocm skinny gemms
     "VLLM_ROCM_USE_SKINNY_GEMM": lambda: (
         os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in ("true", "1")
@@ -1093,6 +1117,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     ),
     # Allow use of DeepGemm kernels for fused moe ops.
     "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "1"))),
+    # Allow use of DeepGemm specifically for MoE fused ops (overrides only MoE).
+    "VLLM_MOE_USE_DEEP_GEMM": lambda: bool(
+        int(os.getenv("VLLM_MOE_USE_DEEP_GEMM", "1"))
+    ),
     # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs.
     "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
         int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
@@ -1207,7 +1235,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # - "latency":
     #     Uses TensorRT-LLM kernels optimized for low-latency inference.
     "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
-        "VLLM_FLASHINFER_MOE_BACKEND", "throughput", ["throughput", "latency"]
+        "VLLM_FLASHINFER_MOE_BACKEND", "latency", ["throughput", "latency"]
     ),
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
@@ -1303,7 +1331,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
     ),
     # If set, it means we pre-downloaded cubin files and flashinfer will
     # read the cubin files directly.
-    "VLLM_HAS_FLASHINFER_CUBIN": lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False),
+    "VLLM_HAS_FLASHINFER_CUBIN": lambda: bool(
+        int(os.getenv("VLLM_HAS_FLASHINFER_CUBIN", "0"))
+    ),
     # Supported options:
     # - "flashinfer-cudnn": use flashinfer cudnn GEMM backend
     # - "flashinfer-trtllm": use flashinfer trtllm GEMM backend
@@ -1312,7 +1342,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_NVFP4_GEMM_BACKEND": env_with_choices(
         "VLLM_NVFP4_GEMM_BACKEND",
         None,
-        ["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass"],
+        ["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass", "cutlass"],
     ),
     # Controls garbage collection during CUDA graph capture.
     # If set to 0 (default), enables GC freezing to speed up capture time.
@@ -1439,9 +1469,23 @@ environment_variables: dict[str, Callable[[], Any]] = {
     #                                      top 5 collected objects
     "VLLM_GC_DEBUG": lambda: os.getenv("VLLM_GC_DEBUG", ""),
     # Disables parallel execution of shared_experts via separate cuda stream
-    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: os.getenv(
-        "VLLM_DISABLE_SHARED_EXPERTS_STREAM", False
+    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: bool(
+        int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "0"))
     ),
+    # Format for saving torch.compile cache artifacts
+    # - "binary": saves as binary file
+    #     Safe for multiple vllm serve processes accessing the same torch compile cache.
+    # - "unpacked": saves as directory structure (for inspection/debugging)
+    #     NOT multiprocess safe - race conditions may occur with multiple processes.
+    #     Allows viewing and setting breakpoints in Inductor's code output files.
+    "VLLM_COMPILE_CACHE_SAVE_FORMAT": env_with_choices(
+        "VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"]
+    ),
+    # Flag to enable FlatLogprobs whose GC overhead is significantly smaller than
+    # the original list[dict[int, Logprob]] approach.
+    # After enabled, PromptLogprobs and SampleLogprobs would populated as
+    # FlatLogprobs.
+    "VLLM_FLAT_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLAT_LOGPROBS", "0"))),
 }
 
 # --8<-- [end:env-vars-definition]
@@ -1530,6 +1574,7 @@ def compute_hash() -> str:
         "VLLM_USE_FLASHINFER_SAMPLER",
         "VLLM_DISABLED_KERNELS",
         "VLLM_USE_DEEP_GEMM",
+        "VLLM_MOE_USE_DEEP_GEMM",
         "VLLM_USE_DEEP_GEMM_E8M0",
         "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
         "VLLM_USE_FLASHINFER_MOE_FP16",
@@ -1550,9 +1595,10 @@ def compute_hash() -> str:
         "VLLM_ROCM_USE_AITER_MLA",
         "VLLM_ROCM_USE_AITER_MHA",
         "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM",
-        "VLLM_ROCM_USE_TRITON_ROPE",
+        "VLLM_ROCM_USE_AITER_TRITON_ROPE",
         "VLLM_ROCM_USE_AITER_FP8BMM",
         "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION",
+        "VLLM_ROCM_USE_AITER_TRITON_GEMM",
         "VLLM_ROCM_USE_SKINNY_GEMM",
         "VLLM_ROCM_FP8_PADDING",
         "VLLM_ROCM_MOE_PADDING",
diff --git a/vllm/logprobs.py b/vllm/logprobs.py
index 21c886e0ad5eb..a34398db2c960 100644
--- a/vllm/logprobs.py
+++ b/vllm/logprobs.py
@@ -1,6 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import dataclass
+import itertools
+from collections.abc import Iterable, Iterator, MutableSequence
+from dataclasses import dataclass, field
+from typing import overload
+
+import vllm.envs as envs
 
 
 # We use dataclass for now because it is used for
@@ -21,8 +26,183 @@ class Logprob:
     decoded_token: str | None = None
 
 
+LogprobsOnePosition = dict[int, Logprob]
+
+
+@dataclass
+class FlatLogprobs(MutableSequence[LogprobsOnePosition]):
+    """
+    Flat logprobs of a request into multiple primitive type lists.
+
+    Compared to list[dict[int, Logprob]], this data structure reduced GC
+    overhead significantly. As it flattened logprob information for
+    all positions and ranks in to multiple primitive type lists (i.e.
+    logprobs, token_ids, ranks per token_ids, decoded_tokens).
+    So regardless of the sequence length and top_logprobs setup,
+    FlatLogprobs would only introduce a constant amount of objects.
+
+    As each position might contains different amount of ranks,
+    start_indices_per_position would be used to access the logprob ranges
+    for different positions.
+
+    NOTE: To reduce the migration overhead and improve backward compatibility,
+    we support the key Sequence APIs of list, so it could act as
+    list[LogprobsOnePosition]
+    """
+
+    # Start / end indices to indicate the range of logprobs for each position.
+    start_indices: list[int] = field(default_factory=list)
+    end_indices: list[int] = field(default_factory=list)
+
+    # Flatten Logprob information for (each position, rank).
+    # For position <i>, the logprobs are ranged
+    # from self.start_indices[i] to self.end_indices[i] (exclusive).
+    token_ids: list[int] = field(default_factory=list)
+    logprobs: list[float] = field(default_factory=list)
+    ranks: list[int | None] = field(default_factory=list)
+    decoded_tokens: list[str | None] = field(default_factory=list)
+
+    def append(self, logprobs_one_position: LogprobsOnePosition | None) -> None:
+        """Appends the container with logprobs for the next position"""
+        self.start_indices.append(len(self.logprobs))
+        if logprobs_one_position:
+            for token_id, logprob in logprobs_one_position.items():
+                self.token_ids.append(token_id)
+                self.logprobs.append(logprob.logprob)
+                self.ranks.append(logprob.rank)
+                self.decoded_tokens.append(logprob.decoded_token)
+        self.end_indices.append(len(self.logprobs))
+
+    def append_fast(
+        self,
+        token_ids: list[int],
+        logprobs: list[float],
+        ranks: itertools.chain[int],
+        decoded_tokens: Iterable[str | None],
+    ) -> None:
+        """
+        Appends logprobs for the next position without creating
+        the intermediate logprob dictionary.
+        """
+        self.start_indices.append(len(self.logprobs))
+        for token_id, logprob, rank, decoded_token in zip(
+            token_ids, logprobs, ranks, decoded_tokens
+        ):
+            self.token_ids.append(token_id)
+            self.logprobs.append(logprob)
+            self.ranks.append(rank)
+            self.decoded_tokens.append(decoded_token)
+        self.end_indices.append(len(self.logprobs))
+
+    def extend(self, logprobs_multi_positions) -> None:
+        """Extends the container with logprobs for the next multiple positions"""
+        for logprobs_one_position in logprobs_multi_positions:
+            self.append(logprobs_one_position)
+
+    def __len__(self) -> int:
+        """Gets number of positions stored in the container"""
+        return len(self.start_indices)
+
+    @overload
+    def __getitem__(self, position: int) -> LogprobsOnePosition: ...
+
+    @overload
+    def __getitem__(self, s: slice, /) -> "FlatLogprobs": ...
+
+    def __getitem__(self, index: int | slice):
+        """Extracts logprobs of a given position or slice"""
+        if isinstance(index, int):
+            return {
+                self.token_ids[i]: Logprob(
+                    logprob=self.logprobs[i],
+                    rank=self.ranks[i],
+                    decoded_token=self.decoded_tokens[i],
+                )
+                for i in range(self.start_indices[index], self.end_indices[index])
+            }
+        elif isinstance(index, slice):
+            min_index = self.start_indices[index][0]
+            max_index = self.end_indices[index][-1]
+            return FlatLogprobs(
+                # Shift updated start_indices and end_indices to
+                # be 0-indexed
+                start_indices=[i - min_index for i in self.start_indices[index]],
+                end_indices=[i - min_index for i in self.end_indices[index]],
+                token_ids=self.token_ids[min_index:max_index],
+                logprobs=self.logprobs[min_index:max_index],
+                ranks=self.ranks[min_index:max_index],
+                decoded_tokens=self.decoded_tokens[min_index:max_index],
+            )
+        else:
+            raise TypeError(f"Invalid index type: {type(index)}")
+
+    def __setitem__(self, item, value) -> None:
+        raise TypeError("Cannot set logprobs in FlatLogprobs")
+
+    def __delitem__(self, item) -> None:
+        raise TypeError("Cannot delete logprobs from FlatLogprobs")
+
+    def insert(self, item) -> None:
+        raise TypeError("Cannot insert logprobs to FlatLogprobs")
+
+    def __iter__(self) -> Iterator[LogprobsOnePosition]:
+        """
+        Iterates the container and yields LogprobsOnePosition for
+        each position.
+        """
+        for i in range(0, len(self.start_indices)):
+            yield self.__getitem__(i)
+
+
 # {token_id -> logprob} per each sequence group. None if the corresponding
 # sequence group doesn't require prompt logprob.
-PromptLogprobs = list[dict[int, Logprob] | None]
+PromptLogprobs = FlatLogprobs | list[LogprobsOnePosition | None]
 # {token_id -> logprob} for each sequence group.
-SampleLogprobs = list[dict[int, Logprob]]
+SampleLogprobs = FlatLogprobs | list[LogprobsOnePosition]
+
+
+def create_prompt_logprobs() -> PromptLogprobs:
+    """Creates a container to store prompt logprobs for a request"""
+    logprobs = FlatLogprobs() if envs.VLLM_FLAT_LOGPROBS else []
+    # NOTE: logprob of first prompt token is None.
+    logprobs.append(None)
+    return logprobs
+
+
+def create_sample_logprobs() -> SampleLogprobs:
+    """Creates a container to store decode logprobs for a request"""
+    return FlatLogprobs() if envs.VLLM_FLAT_LOGPROBS else []
+
+
+def append_logprobs_for_next_position(
+    request_logprobs: PromptLogprobs | SampleLogprobs,
+    token_ids: list[int],
+    logprobs: list[float],
+    decoded_tokens: Iterable[str | None],
+    rank: int,
+    num_logprobs: int,
+) -> None:
+    """Appends logprobs for the next position"""
+    if num_logprobs == -1:
+        num_logprobs = len(logprobs)
+    # We do not need a special case for the sampled token
+    # being in the topk, since inserting duplicated data
+    # into a dictionary twice is the same as doing it once.
+    topk_ranks = range(1, num_logprobs + 1)
+    ranks = itertools.chain((rank,), topk_ranks)
+
+    if isinstance(request_logprobs, FlatLogprobs):
+        request_logprobs.append_fast(token_ids, logprobs, ranks, decoded_tokens)
+    else:
+        request_logprobs.append(
+            {
+                token_id: Logprob(
+                    logprob=logprob,
+                    rank=rank,
+                    decoded_token=token,
+                )
+                for token_id, logprob, rank, token in zip(
+                    token_ids, logprobs, ranks, decoded_tokens
+                )
+            }
+        )
diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py
index 275a2ed0c6813..dadb9e25ba2f1 100644
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -13,6 +13,7 @@ from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.lora.layers.base import BaseLayerWithLoRA
+from vllm.lora.ops.triton_ops.utils import get_lora_op_configs
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import (
     _get_config_dtype_str,
@@ -24,6 +25,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
     modular_triton_fused_moe,
     try_get_optimal_moe_config,
 )
+from vllm.model_executor.layers.fused_moe.layer import FusedMoEModularMethod
 
 
 class FusedMoEWithLoRA(BaseLayerWithLoRA):
@@ -39,6 +41,64 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
         self.device = base_layer.w2_weight.device
         self._inject_lora_into_fused_moe()
 
+    def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]:
+        normalized_config = {}
+        for key, value in config.items():
+            if key.islower():
+                if key.startswith("block_"):
+                    normalized_key = "BLOCK_SIZE_" + key.split("_")[-1].upper()
+                else:
+                    normalized_key = key.upper()
+            else:
+                normalized_key = key
+            normalized_config[normalized_key] = value
+        return normalized_config
+
+    def _get_lora_moe_configs(
+        self,
+        op_prefix: str,
+        lora_a_stacked: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        num_slices: int,
+        M: int,
+        layer: FusedMoE,
+        top_k: int,
+        config_dtype: str,
+    ):
+        if envs.VLLM_TUNED_CONFIG_FOLDER:
+            shrink_config = get_lora_op_configs(
+                op_type=f"fused_moe_lora_{op_prefix}_shrink",
+                max_loras=lora_a_stacked.shape[0],
+                batch=M,
+                hidden_size=lora_a_stacked.shape[-1],
+                rank=lora_a_stacked.shape[-2],
+                num_slices=num_slices,
+                moe_intermediate_size=lora_b_stacked.shape[-2],
+            )
+            expand_config = get_lora_op_configs(
+                op_type=f"fused_moe_lora_{op_prefix}_expand",
+                max_loras=lora_a_stacked.shape[0],
+                batch=M,
+                hidden_size=lora_a_stacked.shape[-1],
+                rank=lora_a_stacked.shape[-2],
+                num_slices=num_slices,
+                moe_intermediate_size=lora_b_stacked.shape[-2],
+            )
+        else:  # fall back to the default config
+            get_config_func = functools.partial(
+                try_get_optimal_moe_config,
+                layer.w13_weight.size(),
+                layer.w2_weight.size(),
+                top_k,
+                config_dtype,
+                block_shape=layer.quant_method.moe_quant_config.block_shape,
+            )
+            shrink_config = get_config_func(M)
+            expand_config = get_config_func(M)
+        shrink_config = self._normalize_keys(shrink_config)
+        expand_config = self._normalize_keys(expand_config)
+        return shrink_config, expand_config
+
     def _inject_lora_into_fused_moe(self):
         moe_state_dict = {}
         top_k = self.base_layer.top_k
@@ -90,17 +150,19 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 num_tokens = hidden_states.size(0)
                 M = min(num_tokens, CHUNK_SIZE)
 
-                get_config_func = functools.partial(
-                    try_get_optimal_moe_config,
-                    layer.w13_weight.size(),
-                    layer.w2_weight.size(),
-                    top_k,
-                    config_dtype,
-                    block_shape=layer.quant_method.moe_quant_config.block_shape,
+                shrink_config, expand_config = self._get_lora_moe_configs(
+                    op_prefix="w13",
+                    lora_a_stacked=self.w1_lora_a_stacked,
+                    lora_b_stacked=self.w1_lora_b_stacked,
+                    num_slices=2,
+                    M=M,
+                    layer=layer,
+                    top_k=top_k,
+                    config_dtype=config_dtype,
                 )
 
+                # get the block size of m from customized config or default config
                 max_loras = self.w1_lora_a_stacked.shape[0]
-                config = get_config_func(M)
                 (
                     sorted_token_ids_lora,
                     expert_ids_lora,
@@ -108,9 +170,10 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 ) = self.punica_wrapper.moe_lora_align_block_size(
                     curr_topk_ids,
                     num_tokens,
-                    config["BLOCK_SIZE_M"],
+                    shrink_config["BLOCK_SIZE_M"],
                     self.base_layer.local_num_experts,
                     max_loras,
+                    self.adapter_enabled,
                     expert_map,
                 )
 
@@ -137,7 +200,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                     num_tokens_post_padded_lora,
                     max_lora_rank,
                     top_k,
-                    config,
+                    shrink_config,  ## pass the shrink config
+                    expand_config,  ## pass the expand config
+                    self.adapter_enabled,
                 )
 
                 result = func(*args, **kwargs)
@@ -162,17 +227,17 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 num_tokens = hidden_states.size(0)
                 M = min(num_tokens, CHUNK_SIZE)
 
-                get_config_func = functools.partial(
-                    try_get_optimal_moe_config,
-                    layer.w13_weight.size(),
-                    layer.w2_weight.size(),
-                    top_k,
-                    config_dtype,
-                    block_shape=layer.quant_method.moe_quant_config.block_shape,
+                shrink_config, expand_config = self._get_lora_moe_configs(
+                    op_prefix="w2",
+                    lora_a_stacked=self.w2_lora_a_stacked,
+                    lora_b_stacked=self.w2_lora_b_stacked,
+                    num_slices=1,
+                    M=M,
+                    layer=layer,
+                    top_k=top_k,
+                    config_dtype=config_dtype,
                 )
 
-                config = get_config_func(M)
-
                 sorted_token_ids_lora = moe_state_dict["sorted_token_ids_lora"]
                 expert_ids_lora = moe_state_dict["expert_ids_lora"]
                 num_tokens_post_padded_lora = moe_state_dict[
@@ -195,7 +260,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                     num_tokens_post_padded_lora,
                     max_lora_rank,
                     top_k,
-                    config,
+                    shrink_config,  ## pass the shrink config
+                    expand_config,  ## pass the expand config
+                    self.adapter_enabled,
                     True,
                 )
 
@@ -214,10 +281,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
             self.base_layer, fused_experts.moe_sum
         )
 
-        self.base_layer.quant_method.old_fused_experts = (
-            self.base_layer.quant_method.fused_experts
+        self.base_layer.quant_method = FusedMoEModularMethod(
+            self.base_layer.quant_method, m_fused_moe_fn
         )
-        self.base_layer.quant_method.fused_experts = m_fused_moe_fn
 
     def create_lora_weights(
         self,
@@ -227,6 +293,10 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
     ) -> None:
         """Initializes lora matrices."""
 
+        self.adapter_enabled = torch.tensor(
+            [0] * (max_loras + 1), dtype=torch.int, device=self.device
+        )
+
         self.w1_lora_a_stacked = torch.zeros(
             (
                 max_loras,
@@ -313,6 +383,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
         self.w3_lora_b_stacked[index] = 0
         self.w2_lora_a_stacked[index] = 0
         self.w2_lora_b_stacked[index] = 0
+        self.adapter_enabled[index] = 0
 
     def set_lora(
         self,
@@ -322,8 +393,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
         embeddings_tensor: torch.Tensor | None,
         bias: torch.Tensor | None = None,
     ):
-        self.reset_lora(index)
         """Overwrites lora tensors at index."""
+        self.reset_lora(index)
+        self.adapter_enabled[index] = 1
         for eid in range(len(lora_a) // 3):
             w1_lora_a = lora_a[eid * 3]
             w2_lora_a = lora_a[eid * 3 + 1]
diff --git a/vllm/lora/ops/triton_ops/README_TUNING.md b/vllm/lora/ops/triton_ops/README_TUNING.md
index fda95ea71891f..3ebe1fd7c3700 100644
--- a/vllm/lora/ops/triton_ops/README_TUNING.md
+++ b/vllm/lora/ops/triton_ops/README_TUNING.md
@@ -1,51 +1,60 @@
 # Multi-LoRA Tuning
 
-**Note**: The LoRA configuration folder should be specified by exporting `VLLM_TUNED_CONFIG_FOLDER=/path/to/configs`. Without this, the shrink/expand kernels will use default configurations.
+**Note**: The LoRA configuration folder should be specified by exporting `VLLM_TUNED_CONFIG_FOLDER=/path/to/configs`.
+Without this, the shrink/expand kernels will use default configurations.
 
 ## Tuning Process
 
-Multi-lora shrink/expand Triton kernel tuning follows a similar methodology from [Triton MoE tuning](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py).
+Multi-lora shrink/expand Triton kernel tuning follows a similar methodology from
+[Triton MoE tuning](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py).
 
-**Step 1**
-Define the searching space. An example searching space:
+1. Define the searching space. Here is an example of searching space:
 
-```python
-block_m_range = [16, 32, 64, 128, 256]
-block_n_range = [32, 64, 128, 256]
-block_k_range = [32, 64, 128, 256]
-num_warps_range = [4, 8]
-num_stage_range = [2, 3, 4, 5]
-num_ctas_range = [1]
-split_k_range = [4, 8, 16, 32, 64]
-```
+   ```python
+   block_m_range = [16, 32, 64, 128, 256]
+   block_n_range = [32, 64, 128, 256]
+   block_k_range = [32, 64, 128, 256]
+   num_warps_range = [4, 8]
+   num_stage_range = [2, 3, 4, 5]
+   num_ctas_range = [1]
+   split_k_range = [4, 8, 16, 32, 64]
+   ```
 
-**Step 2**
-Get all hidden_state sizes and num_slices that the target model uses for a specific TP size.
+2. Get all hidden_state sizes and num_slices that the target model uses for a specific TP size.
 
-For example, we can aquire those info by simply checking [add_lora_linear](https://github.com/li2haipeng/vllm/blob/multi_lora_v01011/vllm/lora/punica_wrapper/punica_gpu.py#L192):
+   For example, you can acquire the info by simply checking
+   [add_lora_linear](https://github.com/vllm-project/vllm/blob/main/vllm/lora/punica_wrapper/punica_gpu.py#L181):
 
-```python
-print(f"x_shape: {x.view(-1, x.shape[-1]).shape}")
-print(f"num_sclises: {len(output_slices)}")
-for i in range(len(output_slices)):
-    print(f"a{i} shape: {lora_a_stacked[i].shape}")
-    print(f"b{i} shape: {lora_b_stacked[i].shape}")
-print("y_shape", y.shape)
-```
+   ```python
+   print(f"x_shape: {x.view(-1, x.shape[-1]).shape}")
+   print(f"num_slices: {len(output_slices)}")
+   for i in range(len(output_slices)):
+       print(f"a{i} shape: {lora_a_stacked[i].shape}")
+       print(f"b{i} shape: {lora_b_stacked[i].shape}")
+   print("y_shape", y.shape)
+   ```
 
-**Step 3**
-Benchmark the shrink/expand kernel runtime with different kernel configurations generated from the pre-defined search space by performing a grid search to find the optimal kernel configuration. vLLM's [benchmark_lora.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_lora.py) can be used to search for configurations for different shapes.
+3. Benchmark the shrink/expand kernel runtime with different kernel configurations generated from the pre-defined search space
+   by performing a grid search to find the optimal kernel configuration.
+   vLLM's [benchmark_lora.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_lora.py)
+   can be used to search for configurations for different shapes.
 
 ## Config Files
 
-### File Name
+### File Naming
 
-For `shrink`, the config file is named as `{gpu_name}_SHRINK.json`, e.g. `NVIDIA_H200_SHRINK.json`.
+| Kernel Type               | File Name Template                          | Example                                     |
+|---------------------------|--------------------------------------------|---------------------------------------------|
+| shrink                    | `{gpu_name}_SHRINK.json`                   | `NVIDIA_H200_SHRINK.json`                  |
+| expand                    | `{gpu_name}_EXPAND_{add_input}.json`       | `NVIDIA_H200_EXPAND_TRUE.json`             |
+| fused_moe_lora_w13_shrink | `{gpu_name}_FUSED_MOE_LORA_W13_SHRINK.json` | `NVIDIA_H200_FUSED_MOE_LORA_W13_SHRINK.json` |
+| fused_moe_lora_w13_expand | `{gpu_name}_FUSED_MOE_LORA_W13_EXPAND.json` | `NVIDIA_H200_FUSED_MOE_LORA_W13_EXPAND.json` |
+| fused_moe_lora_w2_shrink  | `{gpu_name}_FUSED_MOE_LORA_W2_SHRINK.json`  | `NVIDIA_H200_FUSED_MOE_LORA_W2_SHRINK.json` |
+| fused_moe_lora_w2_expand  | `{gpu_name}_FUSED_MOE_LORA_W2_EXPAND.json`  | `NVIDIA_H200_FUSED_MOE_LORA_W2_EXPAND.json` |
 
-For `expand`, the config fileis named as `{gpu_name}_EXPAND_{add_input}.json`, e.g. `NVIDIA_H200_EXPAND_TRUE.json`.
+The `gpu_name` can be automatically detected by calling `torch.cuda.get_device_name()`.
 
-The `gpu_name` can be automatically detected by calling `torch.cuda.get_device_name()`
+### JSON Structure
 
-### Json Structure
-
-Optimal kernel configuration files are saved as JSON files with the structure `config_data[max_loras][num_slices][m][k][n]`
+Optimal kernel configuration files are saved as JSON files with the structure `config_data[max_loras][num_slices][m][k][n][i]`,
+where `i` is an optional dimension in the `fused_moe_lora` configuration, representing the intermediate size of the MoE layer.
diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py
index 436ea4ed00c82..7e8b9a79add39 100644
--- a/vllm/lora/ops/triton_ops/__init__.py
+++ b/vllm/lora/ops/triton_ops/__init__.py
@@ -1,7 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.lora.ops.triton_ops.fused_moe_lora_op import fused_moe_lora
+
+from vllm.lora.ops.triton_ops.fused_moe_lora_op import (
+    fused_moe_lora,
+    fused_moe_lora_expand,
+    fused_moe_lora_shrink,
+)
 from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand
 from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
 from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink
@@ -11,4 +16,6 @@ __all__ = [
     "lora_shrink",
     "LoRAKernelMeta",
     "fused_moe_lora",
+    "fused_moe_lora_shrink",
+    "fused_moe_lora_expand",
 ]
diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
index 15031f5e2f9e8..6d6de2529de3d 100644
--- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
@@ -6,6 +6,8 @@ import torch
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import direct_register_custom_op
 
+from .utils import supports_pdl
+
 _LORA_PTR_DICT: dict[tuple[int, ...], torch.tensor] = {}
 
 
@@ -54,6 +56,8 @@ def _fused_moe_lora_kernel(
     EM,
     num_valid_tokens,
     num_experts,
+    lora_ids,
+    adapter_enabled,
     # The stride variables represent how much to increase the ptr by when
     # moving by 1 element in a particular dimension. E.g. `stride_am` is
     # how much to increase `a_ptr` by to get the element one row down
@@ -80,10 +84,17 @@ def _fused_moe_lora_kernel(
     BLOCK_SIZE_K: tl.constexpr,
     GROUP_SIZE_M: tl.constexpr,
     SPLIT_K: tl.constexpr,
+    USE_GDC: tl.constexpr,
+    IS_PRIMARY: tl.constexpr,
 ):
     pid = tl.program_id(axis=0)
     slice_id = tl.program_id(axis=1)
     lora_idx = tl.program_id(axis=2)
+    lora_id = tl.load(lora_ids + lora_idx)
+    moe_enabled = tl.load(adapter_enabled + lora_id)
+    if lora_id == -1 or moe_enabled == 0:
+        # Early exit for the no-lora case.
+        return
     max_loras = tl.num_programs(axis=2)
     grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)
 
@@ -100,16 +111,14 @@ def _fused_moe_lora_kernel(
     pid_m = first_pid_m + ((pid_m_n % num_pid_in_group) % group_size_m)
     pid_n = (pid_m_n % num_pid_in_group) // group_size_m
 
-    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr + lora_idx)
+    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr + lora_id)
     if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
         return
-
     # get the expert_id to process curr shard
-    ind = lora_idx * stride_el + pid_m
+    ind = lora_id * stride_el + pid_m
     expert_id = tl.load(expert_ids_ptr + ind, ind < max_loras * stride_el, -1)
     if expert_id == -1:
         return
-
     # get a_ptr,b_ptr,c_ptr
     cur_a_ptr = a_ptr + (slice_id % num_slice_a) * slice_a_size
     cur_b_ptr = tl.load(b_ptr + slice_id).to(tl.pointer_type(c_ptr.dtype.element_ty))
@@ -119,7 +128,7 @@ def _fused_moe_lora_kernel(
     offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
 
     offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
-    token_ind = stride_tl * lora_idx + offs_token_id
+    token_ind = stride_tl * lora_id + offs_token_id
     offs_token = tl.load(
         sorted_token_ids_ptr + token_ind, token_ind < max_loras * stride_tl, 0
     )
@@ -132,7 +141,7 @@ def _fused_moe_lora_kernel(
 
     b_ptrs = (
         cur_b_ptr
-        + lora_idx * stride_bl
+        + lora_id * stride_bl
         + expert_id * stride_be
         + offs_k[:, None] * stride_bk
         + offs_bn[None, :] * stride_bn
@@ -142,12 +151,17 @@ def _fused_moe_lora_kernel(
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
     for k in range(0, grid_k):
         k_remaining = K - k * (BLOCK_SIZE_K * SPLIT_K)
+        # pre-fetch lora weight
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)
+        # GDC wait waits for ALL programs in the the prior kernel to complete
+        # before continuing.
+        if USE_GDC and not IS_PRIMARY:
+            tl.extra.cuda.gdc_wait()
         a = tl.load(
             a_ptrs,
             mask=token_mask[:, None] & (offs_k[None, :] < k_remaining),
             other=0.0,
         )
-        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)
         accumulator += tl.dot(a, b)
         # Advance the ptrs to the next K block.
         a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak
@@ -156,12 +170,15 @@ def _fused_moe_lora_kernel(
     if MUL_ROUTED_WEIGHT:
         moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
         accumulator = accumulator * moe_weight[:, None]
-
+    if USE_GDC and IS_PRIMARY:
+        # GDC launch dependents hints the runtime system to launch dependent kernels.
+        tl.extra.cuda.gdc_launch_dependents()
     accumulator = accumulator.to(c_ptr.dtype.element_ty)
     # Write back the block of the output
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
     c_ptrs = cur_c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
     c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
+
     if SPLIT_K == 1:
         tl.store(c_ptrs, accumulator, mask=c_mask)
     else:
@@ -169,86 +186,52 @@ def _fused_moe_lora_kernel(
 
 
 @torch.inference_mode()
-def _fused_moe_lora(
-    output: torch.Tensor,  # (num_tokens, top_k_num, N*len(lora_a_stacked),)
+def _fused_moe_lora_shrink(
+    a_intermediate_cache1: torch.Tensor,
+    # (num_slices, num_tokens, top_k_num, max_lora_rank)
     qcurr_hidden_states: torch.Tensor,  # (num_tokens, K,)
     lora_a_stacked: list[
         torch.Tensor
     ],  # [(max_loras, num_experts, max_lora_rank, K,),...]
-    lora_b_stacked: list[
-        torch.Tensor
-    ],  # [(max_loras, num_experts, N, max_lora_rank,),...]
     topk_weights: torch.Tensor,  # (num_tokens, top_k_num)
     sorted_token_ids: torch.Tensor,  # (max_loras, _)
     expert_ids: torch.Tensor,  # (max_loras, _ ,)
     num_tokens_post_padded: torch.Tensor,  # (max_loras, )
-    max_lora_rank: int,
     top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    ## adding for kernel
+    device: torch.device,
+    N: int,
+    M: int,
+    EM: int,
+    K: int,
+    num_tokens: int,
+    num_experts: int,
+    num_slices: int,
     block_size_m: int,
     block_size_n: int,
     block_size_k: int,
     group_size_m: int,
+    num_warps: int,
+    num_stages: int,
     split_k: int,
     mul_routed_weight: bool = False,
 ) -> None:
-    assert len(lora_a_stacked) == len(lora_b_stacked) > 0
-    assert (
-        sorted_token_ids.dim()
-        == expert_ids.dim()
-        == topk_weights.dim()
-        == qcurr_hidden_states.dim()
-        == 2
-    )
-    assert (
-        sorted_token_ids.shape[0]
-        == expert_ids.shape[0]
-        == num_tokens_post_padded.shape[0]
-    )
-    assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1]
-    assert output.shape[0] == topk_weights.shape[0]
-    assert top_k_num == topk_weights.shape[1]
-
-    for lora_a, lora_b in zip(lora_a_stacked, lora_b_stacked):
-        assert lora_a.dtype == lora_b.dtype == output.dtype == qcurr_hidden_states.dtype
-        assert lora_a.dtype in [torch.float16, torch.bfloat16]
-
-    device = qcurr_hidden_states.device
-    num_slices = len(lora_a_stacked)
-
-    config = {
+    w1_lora_a_stacked = lora_a_stacked[0]
+    use_gdc = supports_pdl(qcurr_hidden_states.device)
+    shrink_config = {
         "BLOCK_SIZE_M": block_size_m,
         "BLOCK_SIZE_N": block_size_n,
         "BLOCK_SIZE_K": block_size_k,
         "GROUP_SIZE_M": group_size_m,
+        "num_warps": num_warps,
+        "num_stages": num_stages,
         "SPLIT_K": split_k,
+        "USE_GDC": use_gdc,
+        "launch_pdl": use_gdc,  # triton kernel metadata
     }
 
-    w1_lora_a_stacked = lora_a_stacked[0]
-    w1_lora_b_stacked = lora_b_stacked[0]
-    num_experts = lora_a_stacked[0].shape[1]
-
-    N = max_lora_rank
-    M = topk_weights.shape[0]
-    EM = sorted_token_ids.shape[1]
-    K = qcurr_hidden_states.shape[1]
-    num_tokens = M * top_k_num
-    w1_output_dim_size = w1_lora_b_stacked.shape[2]
-
-    lora_intermediate_cache1 = torch.empty(
-        (num_slices * M * top_k_num * (max_lora_rank + w1_output_dim_size)),
-        dtype=output.dtype,
-        device=device,
-    )
-
-    # slices
-    a_intermediate_size = num_slices * M * top_k_num * max_lora_rank
-    a_intermediate_cache1 = lora_intermediate_cache1[:a_intermediate_size].view(
-        num_slices, M, top_k_num, max_lora_rank
-    )
-    b_intermediate_cache1 = lora_intermediate_cache1[a_intermediate_size:].view(
-        num_slices, M, top_k_num, w1_output_dim_size
-    )
-
     b_ptr = _get_ptr(lora_a_stacked, device)
 
     grid = lambda META: (
@@ -258,7 +241,6 @@ def _fused_moe_lora(
         len(lora_a_stacked),
         lora_a_stacked[0].shape[0],
     )
-
     _fused_moe_lora_kernel[grid](
         qcurr_hidden_states,
         b_ptr,
@@ -272,6 +254,8 @@ def _fused_moe_lora(
         EM,
         num_tokens,
         num_experts,
+        lora_ids,
+        adapter_enabled,
         qcurr_hidden_states.stride(0),
         qcurr_hidden_states.stride(1),
         w1_lora_a_stacked.stride(0),
@@ -288,19 +272,73 @@ def _fused_moe_lora(
         num_slice_c=num_slices,
         top_k=1 if mul_routed_weight else top_k_num,
         MUL_ROUTED_WEIGHT=False,
-        **config,
+        IS_PRIMARY=True,
+        **shrink_config,
     )
 
+
+@torch.inference_mode()
+def _fused_moe_lora_expand(
+    output: torch.Tensor,  # (num_tokens, top_k_num, N*len(lora_a_stacked),)
+    a_intermediate_cache1: torch.Tensor,  # (num_slices, M, top_k_num, max_lora_rank)
+    lora_b_stacked: list[
+        torch.Tensor
+    ],  # [(max_loras, num_experts, max_lora_rank, K,),...]
+    topk_weights: torch.Tensor,  # (num_tokens, top_k_num)
+    sorted_token_ids: torch.Tensor,  # (max_loras, _)
+    expert_ids: torch.Tensor,  # (max_loras, _ ,)
+    num_tokens_post_padded: torch.Tensor,  # (max_loras, )
+    top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    ## adding for kernel
+    device: torch.device,
+    N: int,
+    M: int,
+    EM: int,
+    K: int,
+    num_tokens: int,
+    num_experts: int,
+    num_slices: int,
+    max_lora_rank: int,
+    w1_output_dim_size: int,
+    block_size_m: int,
+    block_size_n: int,
+    block_size_k: int,
+    group_size_m: int,
+    num_warps: int,
+    num_stages: int,
+    split_k: int,
+    mul_routed_weight: bool = False,
+) -> None:
     b_ptr = _get_ptr(lora_b_stacked, device)
     K = max_lora_rank
     N = w1_output_dim_size
 
+    w1_lora_b_stacked = lora_b_stacked[0]
+
     a_intermediate_cache1 = a_intermediate_cache1.view(
         -1, a_intermediate_cache1.shape[3]
     )
 
-    # Set split_k = 1 for expand calls
-    config["SPLIT_K"] = 1
+    b_intermediate_cache1 = torch.zeros(
+        (num_slices, M, top_k_num, w1_output_dim_size),
+        dtype=output.dtype,
+        device=device,
+    )
+    use_gdc = supports_pdl(a_intermediate_cache1.device)
+    expand_config = {
+        "BLOCK_SIZE_M": block_size_m,
+        "BLOCK_SIZE_N": block_size_n,
+        "BLOCK_SIZE_K": block_size_k,
+        "GROUP_SIZE_M": group_size_m,
+        "num_warps": num_warps,
+        "num_stages": num_stages,
+        "SPLIT_K": split_k,  # Set split_k = 1 for expand calls
+        "USE_GDC": use_gdc,
+        "launch_pdl": use_gdc,  # triton kernel metadata
+    }
+
     grid = lambda META: (
         triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
         len(lora_b_stacked),
@@ -319,6 +357,8 @@ def _fused_moe_lora(
         EM,
         num_tokens,
         num_experts,
+        lora_ids,
+        adapter_enabled,
         a_intermediate_cache1.stride(0),
         a_intermediate_cache1.stride(1),
         w1_lora_b_stacked.stride(0),
@@ -335,12 +375,143 @@ def _fused_moe_lora(
         num_slice_c=num_slices,
         top_k=1,
         MUL_ROUTED_WEIGHT=mul_routed_weight,
-        **config,
+        IS_PRIMARY=False,
+        **expand_config,
     )
     for i in range(num_slices):
         output[:, :, i * N : (i + 1) * N] += b_intermediate_cache1[i]
 
 
+@torch.inference_mode()
+def _fused_moe_lora(
+    output: torch.Tensor,  # (num_tokens, top_k_num, N*len(lora_a_stacked),)
+    qcurr_hidden_states: torch.Tensor,  # (num_tokens, K,)
+    lora_a_stacked: list[
+        torch.Tensor
+    ],  # [(max_loras, num_experts, max_lora_rank, K,),...]
+    lora_b_stacked: list[
+        torch.Tensor
+    ],  # [(max_loras, num_experts, N, max_lora_rank,),...]
+    topk_weights: torch.Tensor,  # (num_tokens, top_k_num)
+    sorted_token_ids: torch.Tensor,  # (max_loras, _)
+    expert_ids: torch.Tensor,  # (max_loras, _ ,)
+    num_tokens_post_padded: torch.Tensor,  # (max_loras, )
+    max_lora_rank: int,
+    top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    shrink_block_size_m: int,
+    shrink_block_size_n: int,
+    shrink_block_size_k: int,
+    shrink_group_size_m: int,
+    shrink_num_warps: int,
+    shrink_num_stages: int,
+    shrink_split_k: int,
+    expand_block_size_m: int,
+    expand_block_size_n: int,
+    expand_block_size_k: int,
+    expand_group_size_m: int,
+    expand_num_warps: int,
+    expand_num_stages: int,
+    expand_split_k: int,
+    mul_routed_weight: bool = False,
+) -> None:
+    assert len(lora_a_stacked) == len(lora_b_stacked) > 0
+    assert (
+        sorted_token_ids.dim()
+        == expert_ids.dim()
+        == topk_weights.dim()
+        == qcurr_hidden_states.dim()
+        == 2
+    )
+    assert (
+        sorted_token_ids.shape[0]
+        == expert_ids.shape[0]
+        == num_tokens_post_padded.shape[0]
+    )
+    assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1]
+    assert output.shape[0] == topk_weights.shape[0]
+    assert top_k_num == topk_weights.shape[1]
+    device = qcurr_hidden_states.device
+    num_slices = len(lora_a_stacked)
+    w1_lora_b_stacked = lora_b_stacked[0]
+    num_experts = lora_a_stacked[0].shape[1]
+    N = max_lora_rank
+    M = topk_weights.shape[0]
+    EM = sorted_token_ids.shape[1]
+    K = qcurr_hidden_states.shape[1]
+    num_tokens = M * top_k_num
+    w1_output_dim_size = w1_lora_b_stacked.shape[2]
+
+    a_intermediate_cache1 = torch.zeros(
+        (num_slices, M, top_k_num, max_lora_rank),
+        dtype=output.dtype,
+        device=device,
+    )
+
+    _fused_moe_lora_shrink(
+        a_intermediate_cache1,
+        qcurr_hidden_states,
+        lora_a_stacked,
+        topk_weights,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        top_k_num,
+        lora_ids,
+        adapter_enabled,
+        ## adding for kernel
+        device,
+        N,
+        M,
+        EM,
+        K,
+        num_tokens,
+        num_experts,
+        num_slices,
+        shrink_block_size_m,
+        shrink_block_size_n,
+        shrink_block_size_k,
+        shrink_group_size_m,
+        shrink_num_warps,
+        shrink_num_stages,
+        shrink_split_k,
+        mul_routed_weight,
+    )
+
+    _fused_moe_lora_expand(
+        output,
+        a_intermediate_cache1,
+        lora_b_stacked,
+        topk_weights,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        top_k_num,
+        lora_ids,
+        adapter_enabled,
+        ## adding for kernel
+        device,
+        N,
+        M,
+        EM,
+        K,
+        num_tokens,
+        num_experts,
+        num_slices,
+        max_lora_rank,
+        w1_output_dim_size,
+        expand_block_size_m,
+        expand_block_size_n,
+        expand_block_size_k,
+        expand_group_size_m,
+        expand_num_warps,
+        expand_num_stages,
+        expand_split_k,
+        mul_routed_weight,
+    )
+
+
 def _fused_moe_lora_fake(
     output: torch.Tensor,
     qcurr_hidden_states: torch.Tensor,
@@ -352,10 +523,86 @@ def _fused_moe_lora_fake(
     num_tokens_post_padded: torch.Tensor,
     max_lora_rank: int,
     top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    shrink_block_size_m: int,
+    shrink_block_size_n: int,
+    shrink_block_size_k: int,
+    shrink_group_size_m: int,
+    shrink_num_warps: int,
+    shrink_num_stages: int,
+    shrink_split_k: int,
+    expand_block_size_m: int,
+    expand_block_size_n: int,
+    expand_block_size_k: int,
+    expand_group_size_m: int,
+    expand_num_warps: int,
+    expand_num_stages: int,
+    expand_split_k: int,
+    mul_routed_weight: bool = False,
+) -> None:
+    return
+
+
+def _fused_moe_lora_shrink_fake(
+    a_intermediate_cache1: torch.Tensor,
+    qcurr_hidden_states: torch.Tensor,
+    lora_a_stacked: list[torch.Tensor],
+    topk_weights: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    device: torch.device,
+    N: int,
+    M: int,
+    EM: int,
+    K: int,
+    num_tokens: int,
+    num_experts: int,
+    num_slices: int,
     block_size_m: int,
     block_size_n: int,
     block_size_k: int,
     group_size_m: int,
+    num_warps: int,
+    num_stages: int,
+    split_k: int,
+    mul_routed_weight: bool = False,
+) -> None:
+    return
+
+
+def _fused_moe_lora_expand_fake(
+    output: torch.Tensor,
+    a_intermediate_cache1: torch.Tensor,
+    lora_b_stacked: list[torch.Tensor],
+    topk_weights: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    top_k_num: int,
+    lora_ids: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    device: torch.device,
+    N: int,
+    M: int,
+    EM: int,
+    K: int,
+    num_tokens: int,
+    num_experts: int,
+    num_slices: int,
+    max_lora_rank: int,
+    w1_output_dim_size: int,
+    block_size_m: int,
+    block_size_n: int,
+    block_size_k: int,
+    group_size_m: int,
+    num_warps: int,
+    num_stages: int,
+    split_k: int,
     mul_routed_weight: bool = False,
 ) -> None:
     return
@@ -368,7 +615,26 @@ try:
         mutates_args=["output"],
         fake_impl=_fused_moe_lora_fake,
     )
+
+    direct_register_custom_op(
+        op_name="fused_moe_lora_shrink",
+        op_func=_fused_moe_lora_shrink,
+        mutates_args=["a_intermediate_cache1"],
+        fake_impl=_fused_moe_lora_shrink_fake,
+    )
+
+    direct_register_custom_op(
+        op_name="fused_moe_lora_expand",
+        op_func=_fused_moe_lora_expand,
+        mutates_args=["output"],
+        fake_impl=_fused_moe_lora_expand_fake,
+    )
+
     fused_moe_lora = torch.ops.vllm.fused_moe_lora
+    fused_moe_lora_shrink = torch.ops.vllm.fused_moe_lora_shrink
+    fused_moe_lora_expand = torch.ops.vllm.fused_moe_lora_expand
 
 except AttributeError:
     fused_moe_lora = _fused_moe_lora
+    fused_moe_lora_shrink = _fused_moe_lora_shrink
+    fused_moe_lora_expand = _fused_moe_lora_expand
diff --git a/vllm/lora/ops/triton_ops/kernel_utils.py b/vllm/lora/ops/triton_ops/kernel_utils.py
index f6397a68ddb81..c6c2a02fdeb53 100644
--- a/vllm/lora/ops/triton_ops/kernel_utils.py
+++ b/vllm/lora/ops/triton_ops/kernel_utils.py
@@ -22,6 +22,8 @@ def mm_k(
     SPLIT_K: tl.constexpr,
     CAST_TYPE: tl.constexpr,
     b_dtype: tl.constexpr,
+    USE_GDC: tl.constexpr,
+    base_k,
 ):
     """
     Given a_ptr and b_ptr, that identify the rows of A (m x k) and columns of
@@ -45,27 +47,63 @@ def mm_k(
         CAST_TYPE: if True, cast the values from the A matrix to the B
           matrix dtype.
         b_dtype: datatype of the B matrix
+        USE_GDC: Whether to use PDL. True indicates use.
+        base_k: Base offset along K dimension for current SPLIT_K group
     """
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-    for k in range(tl.cdiv(K, BLOCK_K * SPLIT_K)):
+
+    # Step size along K for each iteration
+    STEP_K = BLOCK_K * SPLIT_K
+
+    # Total number of iterations (compile-time constant)
+    num_iters = tl.cdiv(K, STEP_K)
+
+    for k in range(num_iters):
+        # Current iteration's global K offset
+        iter_k = k * STEP_K + base_k
+
+        # Check if this iteration is completely valid (no masking needed)
+        block_end = iter_k + BLOCK_K
+
         if EVEN_K:
-            tiled_a = tl.load(a_ptr)
+            # K is divisible by BLOCK_K, no masking ever needed
+            # pre-fetch lora weight
             tiled_b = tl.load(b_ptr)
+            if USE_GDC:
+                tl.extra.cuda.gdc_wait()
+            tiled_a = tl.load(a_ptr)
+            if CAST_TYPE:
+                tiled_a = tiled_a.to(b_dtype)
+            accumulator += tl.dot(tiled_a, tiled_b)
         else:
-            tiled_a = tl.load(
-                a_ptr, mask=offset_k[None, :] < K - k * (BLOCK_K * SPLIT_K), other=0
-            )
-            tiled_b = tl.load(
-                b_ptr, mask=offset_k[:, None] < K - k * (BLOCK_K * SPLIT_K), other=0
-            )
-        if CAST_TYPE:
-            tiled_a = tiled_a.to(b_dtype)
-        accumulator += tl.dot(
-            tiled_a,
-            tiled_b,
-        )
-        a_ptr += BLOCK_K * SPLIT_K * ak_stride
-        b_ptr += BLOCK_K * SPLIT_K * bk_stride
+            # Check if we need element-wise masking
+            if iter_k >= K:
+                # Entire block out of range, skip
+                pass
+            elif block_end <= K:
+                # Entire block in range, no masking needed (fast path)
+                tiled_b = tl.load(b_ptr)
+                if USE_GDC:
+                    tl.extra.cuda.gdc_wait()
+                tiled_a = tl.load(a_ptr)
+                if CAST_TYPE:
+                    tiled_a = tiled_a.to(b_dtype)
+                accumulator += tl.dot(tiled_a, tiled_b)
+            else:
+                # Partial block, need masking (only last iteration)
+                k_offsets = tl.arange(0, BLOCK_K)
+                mask = iter_k + k_offsets < K
+                tiled_b = tl.load(b_ptr, mask=mask[:, None], other=0.0)
+                if USE_GDC:
+                    tl.extra.cuda.gdc_wait()
+                tiled_a = tl.load(a_ptr, mask=mask[None, :], other=0.0)
+                if CAST_TYPE:
+                    tiled_a = tiled_a.to(b_dtype)
+                accumulator += tl.dot(tiled_a, tiled_b)
+
+        a_ptr += STEP_K * ak_stride
+        b_ptr += STEP_K * bk_stride
+
     return accumulator
 
 
@@ -102,6 +140,7 @@ def do_expand_kernel(
     EVEN_K: tl.constexpr,
     CAST_TYPE: tl.constexpr,
     ADD_INPUTS: tl.constexpr,
+    USE_GDC: tl.constexpr,
 ):
     """
     Given an array of integers that identifies the rows of A, ram,
@@ -154,6 +193,7 @@ def do_expand_kernel(
 
     # Compute the block matrix product.
     SPLIT_K = 1
+
     accumulator = mm_k(
         a_ptr,
         b_ptr,
@@ -168,6 +208,8 @@ def do_expand_kernel(
         SPLIT_K,
         CAST_TYPE,
         cur_lora_ptr.dtype.element_ty,
+        USE_GDC,
+        base_k=0,
     )
 
     tiled_c = accumulator.to(cur_lora_ptr.dtype.element_ty)
@@ -223,6 +265,7 @@ def do_shrink_kernel(
     EVEN_K: tl.constexpr,
     SPLIT_K: tl.constexpr,
     SLICE_NUM: tl.constexpr,
+    USE_GDC: tl.constexpr,
 ):
     """
     Given an array of integers that identifies the rows of A, ram,
@@ -272,8 +315,12 @@ def do_shrink_kernel(
         SPLIT_K,
         False,
         cur_lora_ptr.dtype.element_ty,
+        False,  # USE_GDC is always False in shrink kernel
+        base_k=pid_sk * BLOCK_K,
     )
-
+    # GDC launch dependents hints the runtime system to launch dependent kernels.
+    if USE_GDC:
+        tl.extra.cuda.gdc_launch_dependents()
     # Identify the C output pointers to store the results of the accumulator.
     offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
     offset_cm = tl.arange(0, BLOCK_M)
@@ -284,10 +331,10 @@ def do_shrink_kernel(
         + offset_cn[None, :] * output_d2_stride
     )
     c_mask = (offset_cm[:, None] < M_LEN) & (offset_cn[None, :] < N)
-
     accumulator *= scaling
+
     # handles write-back with reduction-splitting
     if SPLIT_K == 1:
         tl.store(c_ptr, accumulator, mask=c_mask)
     else:
-        tl.atomic_add(c_ptr, accumulator, mask=c_mask)
+        tl.atomic_add(c_ptr, accumulator, mask=c_mask, sem="relaxed")
diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py
index fd4c1364de7ea..7f7d70cdc3a4a 100644
--- a/vllm/lora/ops/triton_ops/lora_expand_op.py
+++ b/vllm/lora/ops/triton_ops/lora_expand_op.py
@@ -14,6 +14,8 @@ from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr, get_lora_op_configs
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import direct_register_custom_op
 
+from .utils import supports_pdl
+
 
 @triton.jit
 def _lora_expand_kernel(
@@ -45,6 +47,7 @@ def _lora_expand_kernel(
     CAST_TYPE: tl.constexpr,
     SLICE_NUM: tl.constexpr,
     SAME_STRIDE: tl.constexpr,
+    USE_GDC: tl.constexpr,
 ):
     cta_n_num = tl.cdiv(N, BLOCK_N)
     cta_m_num = tl.cdiv(M, BLOCK_M)
@@ -121,6 +124,7 @@ def _lora_expand_kernel(
         EVEN_K,
         CAST_TYPE,
         ADD_INPUTS,
+        USE_GDC,
     )
 
 
@@ -236,7 +240,7 @@ def _lora_expand(
         # thread blocks simply exit.
         MAX_LORAS,
     )
-
+    use_gdc = supports_pdl(inputs.device)
     _lora_expand_kernel[grid](
         inputs,
         lora_ptr_tensor,
@@ -266,9 +270,11 @@ def _lora_expand(
         CAST_TYPE,
         NUM_SLICES,
         same_stride,
+        use_gdc,
         num_warps=NUM_WARPS,
         num_ctas=NUM_CTAS,
         num_stages=NUM_STAGES,
+        launch_pdl=use_gdc,
     )
 
     return
diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py
index 8d126197f83ea..e78379cf684af 100644
--- a/vllm/lora/ops/triton_ops/lora_shrink_op.py
+++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py
@@ -14,6 +14,8 @@ from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr, get_lora_op_configs
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import direct_register_custom_op
 
+from .utils import supports_pdl
+
 
 @triton.jit
 def _lora_shrink_kernel(
@@ -41,15 +43,25 @@ def _lora_shrink_kernel(
     BLOCK_K: tl.constexpr,
     EVEN_K: tl.constexpr,
     SPLIT_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
     SLICE_NUM: tl.constexpr,
+    USE_GDC: tl.constexpr,
 ):
     cta_n_num = tl.cdiv(N, BLOCK_N)
     cta_m_num = tl.cdiv(M, BLOCK_M)
 
     pid_sk_m_n = tl.program_id(axis=0)
     pid_sk = pid_sk_m_n % SPLIT_K
-    pid_m = (pid_sk_m_n // SPLIT_K) % cta_m_num
-    pid_n = pid_sk_m_n // (SPLIT_K * cta_m_num) % cta_n_num
+
+    pid_m_n = pid_sk_m_n // SPLIT_K
+    num_pid_in_group = GROUP_SIZE_M * cta_n_num
+    group_id = pid_m_n // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(cta_m_num - first_pid_m, GROUP_SIZE_M)
+
+    # Column-major ordering within groups for better cache reuse
+    pid_m = first_pid_m + ((pid_m_n % num_pid_in_group) % group_size_m)
+    pid_n = (pid_m_n % num_pid_in_group) // group_size_m
 
     slice_id = tl.program_id(axis=1)
     lora_idx = tl.program_id(axis=2)
@@ -74,7 +86,6 @@ def _lora_shrink_kernel(
     cta_lora_seq_indices = (
         token_indices_sorted_by_lora_ids + lora_m_indices_start + cta_m_offset
     )
-
     # Load all relevant row indices.
     offset_m = tl.arange(0, BLOCK_M) % cta_m_len
     ram = tl.load(cta_lora_seq_indices + offset_m)
@@ -109,6 +120,7 @@ def _lora_shrink_kernel(
         EVEN_K,
         SPLIT_K,
         SLICE_NUM,
+        USE_GDC,
     )
 
 
@@ -194,6 +206,7 @@ def _lora_shrink(
     NUM_WARPS = kernel_config["num_warps"]
     NUM_STAGES = kernel_config["num_stages"]
     NUM_CTAS = kernel_config["num_ctas"]
+    GROUP_SIZE_M = kernel_config.get("group_size_m", 8)
     EVEN_K = K % (BLOCK_K * SPLIT_K) == 0  # type: ignore
 
     # TODO (varun): This grid formulation maximizes parallelization at the
@@ -207,7 +220,7 @@ def _lora_shrink(
         # thread blocks exit early.
         MAX_LORAS,
     )
-
+    use_gdc = supports_pdl(inputs.device)
     _lora_shrink_kernel[grid](
         inputs,
         lora_ptr_tensor,
@@ -233,10 +246,13 @@ def _lora_shrink(
         BLOCK_K,
         EVEN_K,
         SPLIT_K,
+        GROUP_SIZE_M,
         NUM_SLICES,
+        use_gdc,
         num_warps=NUM_WARPS,
         num_ctas=NUM_CTAS,
         num_stages=NUM_STAGES,
+        launch_pdl=use_gdc,
     )
 
     return
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
index 9ffb6dc3d85e5..8ed42382e3a86 100644
--- a/vllm/lora/ops/triton_ops/utils.py
+++ b/vllm/lora/ops/triton_ops/utils.py
@@ -3,6 +3,7 @@
 
 import functools
 import json
+from functools import lru_cache
 from pathlib import Path
 from typing import Any
 
@@ -10,6 +11,7 @@ import torch
 
 from vllm import envs
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -154,13 +156,13 @@ def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None:
         gpu_name = gpu_name.replace("-", "_")
 
         config_fname = None
-        if op_type == "shrink":
-            config_fname = f"{gpu_name}_{op_type.upper()}.json"
-        else:
-            assert op_type == "expand"
+        # only expand op needs to consider add_inputs
+        if op_type == "expand":
             config_fname = (
                 f"{gpu_name}_{op_type.upper()}_{str(add_inputs).upper()}.json"
             )
+        else:
+            config_fname = f"{gpu_name}_{op_type.upper()}.json"
 
         config_path = Path(f"{user_defined_config_folder}/{config_fname}")
         if not config_path.exists():
@@ -186,8 +188,17 @@ def get_lora_op_configs(
     rank: int,
     num_slices: int,
     add_inputs: bool | None = None,
+    moe_intermediate_size: int | None = None,
 ) -> dict[str, int | None]:
-    assert op_type in ["shrink", "expand"]
+    # Add support for fused_moe_lora ops
+    assert op_type in [
+        "shrink",
+        "expand",
+        "fused_moe_lora_w13_shrink",
+        "fused_moe_lora_w13_expand",
+        "fused_moe_lora_w2_shrink",
+        "fused_moe_lora_w2_expand",
+    ]
 
     # default config
     default = {}
@@ -199,9 +210,26 @@ def get_lora_op_configs(
             "split_k": 64 if batch < 128 else 8,
             "num_warps": 4,
             "num_ctas": 1,
+            "group_size_m": 8,
             "num_stages": 2,
             "max_nreg": None,
         }
+    # The default config for fused_moe_lora ops
+    elif op_type in [
+        "fused_moe_lora_w13_shrink",
+        "fused_moe_lora_w13_expand",
+        "fused_moe_lora_w2_shrink",
+        "fused_moe_lora_w2_expand",
+    ]:
+        default = {
+            "block_m": 64,
+            "block_n": 64,
+            "block_k": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+            "group_size_m": 8,
+            "split_k": 1,
+        }
     else:
         default = {
             "block_m": 64,
@@ -246,5 +274,22 @@ def get_lora_op_configs(
         or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - n))]
     )
 
+    # slice by moe-intermediate-size if applicable
+    if moe_intermediate_size is not None:
+        i = moe_intermediate_size
+        config_data = (
+            config_data.get(str(i))
+            or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - i))]
+        )
+
     assert config_data is not None
     return config_data
+
+
+@lru_cache
+def supports_pdl(device: torch.device | None = None) -> bool:
+    """
+    Refer to: https://github.com/triton-lang/triton/blob/v3.5.0/python/tutorials/11-programmatic-dependent-launch.py
+    """
+    # PDL requires compute capability SM90 or above
+    return current_platform.is_cuda() and current_platform.has_device_capability(90)
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index 5b4a18cf4789b..b6186e8561529 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -456,6 +456,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
         block_size: int,
         num_experts: int,
         max_loras: int,
+        adapter_enabled: torch.Tensor,
         expert_map: torch.Tensor | None = None,
         pad_sorted_ids: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
@@ -478,7 +479,9 @@ class PunicaWrapperBase(PunicaWrapperABC):
         num_tokens_post_padded: torch.Tensor,
         max_lora_rank: int,
         top_k_num: int,
-        config,
+        shrink_config,
+        expand_config,
+        adapter_enabled: torch.Tensor,
         mul_routed_weight=False,
     ):
         """
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index d9590769778ea..ede50a48af985 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -51,8 +51,12 @@ class PunicaWrapperGPU(PunicaWrapperBase):
             self.max_loras, max_num_batched_tokens, device=device
         )
 
+        # When speculative decoding is enabled, max_num_samples is
+        # max_batches * (num_speculative_decoding_tokens + 1).
+        # This line can be optimized by replacing max_num_batched_tokens
+        # to  max_batches * (num_speculative_decoding_tokens + 1).
         self.prompt_mapping_meta = LoRAKernelMeta.make(
-            self.max_loras, max_batches, device=device
+            self.max_loras, max_num_batched_tokens, device=device
         )
 
     def update_metadata(
@@ -305,6 +309,7 @@ class PunicaWrapperGPU(PunicaWrapperBase):
         block_size: int,
         num_experts: int,
         max_loras: int,
+        adapter_enabled: torch.Tensor,
         expert_map: torch.Tensor | None = None,
         pad_sorted_ids: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
@@ -331,7 +336,7 @@ class PunicaWrapperGPU(PunicaWrapperBase):
             (max_loras), dtype=torch.int32, device=topk_ids.device
         )
 
-        (token_lora_mapping, _, _, _, _, _) = self.token_mapping_meta.meta_args(
+        (token_lora_mapping, _, _, _, lora_ids, _) = self.token_mapping_meta.meta_args(
             num_tokens
         )
 
@@ -346,6 +351,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
             sorted_ids,
             expert_ids,
             num_tokens_post_pad,
+            adapter_enabled,
+            lora_ids,
         )
         if expert_map is not None:
             expert_ids = expert_map[expert_ids]
@@ -364,12 +371,15 @@ class PunicaWrapperGPU(PunicaWrapperBase):
         num_tokens_post_padded: torch.Tensor,
         max_lora_rank: int,
         top_k_num: int,
-        config,
+        shrink_config,
+        expand_config,
+        adapter_enabled: torch.Tensor,
         mul_routed_weight=False,
     ):
         """
         Performs a fused forward computation for LoRA of Mixture-of-Experts (MoE) layer.
         """
+        (_, _, _, _, lora_ids, _) = self.token_mapping_meta.meta_args(x.size(0))
         fused_moe_lora(
             y,
             x,
@@ -381,10 +391,21 @@ class PunicaWrapperGPU(PunicaWrapperBase):
             num_tokens_post_padded,
             max_lora_rank,
             top_k_num,
-            config["BLOCK_SIZE_M"],
-            config["BLOCK_SIZE_N"],
-            config["BLOCK_SIZE_K"],
-            config["GROUP_SIZE_M"],
-            config.get("SPLIT_K", 1),
+            lora_ids,
+            adapter_enabled,
+            shrink_config.get("BLOCK_SIZE_M", 64),
+            shrink_config.get("BLOCK_SIZE_N", 64),
+            shrink_config.get("BLOCK_SIZE_K", 32),
+            shrink_config.get("GROUP_SIZE_M", 8),
+            shrink_config.get("NUM_WARPS", 4),
+            shrink_config.get("NUM_STAGES", 3),
+            shrink_config.get("SPLIT_K", 1),
+            expand_config.get("BLOCK_SIZE_M", 64),
+            expand_config.get("BLOCK_SIZE_N", 64),
+            expand_config.get("BLOCK_SIZE_K", 32),
+            expand_config.get("GROUP_SIZE_M", 8),
+            expand_config.get("NUM_WARPS", 4),
+            expand_config.get("NUM_STAGES", 3),
+            expand_config.get("SPLIT_K", 1),
             mul_routed_weight,
         )
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index 39e77b935d3d5..65babd10a948b 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -10,6 +10,7 @@ import torch
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
@@ -736,11 +737,28 @@ def enable_batch_invariant_mode():
 
     _batch_invariant_MODE = True
     _batch_invariant_LIB = torch.library.Library("aten", "IMPL")
-    _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
-    _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA")
-    _batch_invariant_LIB.impl("aten::matmul", matmul_batch_invariant, "CUDA")
-    _batch_invariant_LIB.impl("aten::bmm", bmm_batch_invariant, "CUDA")
-    _batch_invariant_LIB.impl("aten::linear", linear_batch_invariant, "CUDA")
+
+    # Batch invariant matmuls are no longer needed after cublas overrides
+    if not is_torch_equal_or_newer("2.10.0.dev"):
+        if current_platform.is_device_capability(100):
+            # For PyTorch 2.9, B200 uses GEMV for bs=1
+            # Requires https://github.com/pytorch/pytorch/pull/166735
+            _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
+            _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA")
+            _batch_invariant_LIB.impl("aten::matmul", matmul_batch_invariant, "CUDA")
+            _batch_invariant_LIB.impl("aten::linear", linear_batch_invariant, "CUDA")
+        else:
+            # Only source of batch invariance for Hopper is split-k, can disable through
+            # cuBLAS workspace config
+            _original_cublas_workspace_cfg = os.environ.get(
+                "CUBLAS_WORKSPACE_CONFIG", None
+            )
+            _original_cublaslt_workspace_size = os.environ.get(
+                "CUBLASLT_WORKSPACE_SIZE", None
+            )
+            os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+            os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1"
+
     _batch_invariant_LIB.impl(
         "aten::_log_softmax", _log_softmax_batch_invariant, "CUDA"
     )
@@ -749,6 +767,7 @@ def enable_batch_invariant_mode():
     _batch_invariant_LIB.impl("aten::mean.dim", mean_batch_invariant, "CUDA")
 
     # Also monkeypatch torch.bmm directly as a fallback
+    _batch_invariant_LIB.impl("aten::bmm", bmm_batch_invariant, "CUDA")
     _original_torch_bmm = torch.bmm
     torch.bmm = bmm_batch_invariant
 
@@ -770,14 +789,6 @@ def enable_batch_invariant_mode():
     )
     torch.backends.cuda.preferred_blas_library(backend="cublaslt")
 
-    if not is_torch_equal_or_newer("2.10.0.dev"):
-        _original_cublas_workspace_cfg = os.environ.get("CUBLAS_WORKSPACE_CONFIG", None)
-        _original_cublaslt_workspace_size = os.environ.get(
-            "CUBLASLT_WORKSPACE_SIZE", None
-        )
-        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
-        os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1"
-
 
 def disable_batch_invariant_mode():
     global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm
diff --git a/vllm/model_executor/layers/fla/ops/utils.py b/vllm/model_executor/layers/fla/ops/utils.py
index 3a503981a8734..5a48e56a5fbbf 100644
--- a/vllm/model_executor/layers/fla/ops/utils.py
+++ b/vllm/model_executor/layers/fla/ops/utils.py
@@ -17,6 +17,7 @@ from typing import Any, Literal
 
 import torch
 
+from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 
 logger = logging.getLogger(__name__)
@@ -137,8 +138,8 @@ def _check_platform() -> Literal["nvidia", "amd", "intel", "musa"]:
 # For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'.
 # However, the torch backend is 'cuda' for both Nvidia and AMD GPUs.
 # Therefore, we need to check the triton backend to determine the actual GPU vendor.
-device = get_available_device() if get_available_device() != "hip" else "cuda"
-device_torch_lib = getattr(torch, device)
+device = "cuda" if current_platform.is_cuda_alike() else get_available_device()
+device_torch_lib = getattr(torch, device, None)
 device_platform = _check_platform()
 
 is_amd = device_platform == "amd"
diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
index 095ec966ea7e4..b8a97e92ab790 100644
--- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -100,6 +100,7 @@ def persistent_masked_m_silu_mul_quant(
     tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
     num_parallel_tokens=16,
     group_size: int = 128,
+    use_ue8m0: bool | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
     y has shape (E, T, 2*H). The first half of the last dimension is
@@ -164,7 +165,7 @@ def persistent_masked_m_silu_mul_quant(
         device=y.device,
     )
 
-    use_ue8m0 = is_deep_gemm_e8m0_used()
+    use_ue8m0 = use_ue8m0 if use_ue8m0 is not None else is_deep_gemm_e8m0_used()
 
     cuda_arch = current_platform.get_device_capability(
         device_id=y.device.index
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index 2394053329802..a7bd64b1c65e9 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
+from enum import IntEnum
 from typing import Optional, Union
 
 import torch
@@ -91,6 +92,26 @@ def _quant_flags_to_group_shape(
     return a_shape, w_shape
 
 
+# The type of method in top-K routing
+# Please keep this in sync with the counterpart defined in https://github.com/flashinfer-ai/flashinfer/blob/main/include/flashinfer/trtllm/fused_moe/runner.h
+class RoutingMethodType(IntEnum):
+    # Default: Softmax -> TopK
+    Default = (0,)
+    # Renormalize: TopK -> Softmax
+    Renormalize = (1,)
+    # DeepSeekV3: Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups
+    # -> Top8 experts from the Top4 groups
+    DeepSeekV3 = (2,)
+    # Llama4: Top1 -> Sigmoid
+    Llama4 = (3,)
+    # RenormalizeNaive: Softmax -> TopK -> Renormalize
+    RenormalizeNaive = (4,)
+    # TopK: TopK (no softmax)
+    TopK = (5,)
+    # Unspecified
+    Unspecified = 6.0
+
+
 @dataclass
 class FusedMoEQuantDesc:
     """
@@ -463,6 +484,10 @@ def fp8_w8a8_moe_quant_config(
     per_act_token_quant: bool = False,
     per_out_ch_quant: bool = False,
     block_shape: list[int] | None = None,
+    a1_gscale: torch.Tensor | None = None,
+    a2_gscale: torch.Tensor | None = None,
+    g1_alphas: torch.Tensor | None = None,
+    g2_alphas: torch.Tensor | None = None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for fp8 activations and fp8 weights.
@@ -470,9 +495,13 @@ def fp8_w8a8_moe_quant_config(
     return FusedMoEQuantConfig.make(
         torch.float8_e4m3fn,
         w1_scale=w1_scale,
+        g1_alphas=g1_alphas,
         w2_scale=w2_scale,
+        g2_alphas=g2_alphas,
         a1_scale=a1_scale,
+        a1_gscale=a1_gscale,
         a2_scale=a2_scale,
+        a2_gscale=a2_gscale,
         per_act_token_quant=per_act_token_quant,
         per_out_ch_quant=per_out_ch_quant,
         block_shape=block_shape,
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000..ee8a28b833d5a
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json
new file mode 100644
index 0000000000000..09d3fa584edd8
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json
new file mode 100755
index 0000000000000..f5990fc1dd894
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json
@@ -0,0 +1,213 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "768": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000..fc6454ebfb2fe
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json
new file mode 100644
index 0000000000000..48997646d99b6
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..381eb5d826a5b
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.5.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000..c7df36e8b1740
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000..6825378d37989
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.4.0",
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
index 552d9e9cf88f3..23ace3408562a 100644
--- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
@@ -5,6 +5,7 @@ from collections.abc import Callable
 import torch
 from torch.nn import functional as F
 
+from vllm import _custom_ops as ops
 from vllm import envs
 
 
@@ -237,7 +238,43 @@ class SGLFusedMOE:
 
 class CPUFusedMOE:
     def __init__(self, layer: torch.nn.Module) -> None:
-        pass
+        use_onednn_mm = ops._supports_onednn and ops.is_onednn_acl_supported()
+
+        num_experts = layer.w13_weight.size(0)
+        has_w13_bias = hasattr(layer, "w13_bias")
+        has_w2_bias = hasattr(layer, "w2_bias")
+
+        layer.gate_up_linear = []
+        layer.down_linear = []
+
+        for i in range(num_experts):
+            layer_w13_weight = layer.w13_weight[i]
+            layer_w13_bias = layer.w13_bias[i] if has_w13_bias else None
+            layer_w2_weight = layer.w2_weight[i]
+            layer_w2_bias = layer.w2_bias[i] if has_w2_bias else None
+            if use_onednn_mm:
+                gate_up_handle = ops.create_onednn_mm(layer_w13_weight.t(), 32)
+                layer.gate_up_linear.append(
+                    lambda x, handle=gate_up_handle, bias=layer_w13_bias: ops.onednn_mm(
+                        handle, x, bias
+                    )
+                )
+                down_handle = ops.create_onednn_mm(layer_w2_weight.t(), 32)
+                layer.down_linear.append(
+                    lambda x, handle=down_handle, bias=layer_w2_bias: ops.onednn_mm(
+                        handle, x, bias
+                    )
+                )
+            else:
+                layer.gate_up_linear.append(
+                    lambda x, w=layer_w13_weight, b=layer_w13_bias: F.linear(x, w, b)
+                )
+                layer.down_linear.append(
+                    lambda x, w=layer_w2_weight, b=layer_w2_bias: F.linear(x, w, b)
+                )
+        if use_onednn_mm:  # remove weight
+            layer.w13_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
 
     def __call__(
         self,
@@ -287,8 +324,6 @@ class CPUFusedMOE:
 
         outputs = []
         start_idx = 0
-        has_w13_bias = hasattr(layer, "w13_bias")
-        has_w2_bias = hasattr(layer, "w2_bias")
 
         for i, num_tokens in enumerate(tokens_per_expert):
             end_idx = start_idx + num_tokens
@@ -296,19 +331,12 @@ class CPUFusedMOE:
                 continue
             tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
 
-            layer_w13_weight = layer.w13_weight[i]
-            layer_w13_bias = layer.w13_bias[i] if has_w13_bias else None
-            layer_w2_weight = layer.w2_weight[i]
-            layer_w2_bias = layer.w2_bias[i] if has_w2_bias else None
-
-            gate_up = F.linear(
-                tokens_for_this_expert, layer_w13_weight, bias=layer_w13_bias
-            )
+            gate_up = layer.gate_up_linear[i](tokens_for_this_expert)
             if activation == "swigluoai":
                 gate_up = swigluoai_and_mul(gate_up)
             else:
                 gate_up = silu_and_mul(gate_up)
-            expert_out = F.linear(gate_up, layer_w2_weight, bias=layer_w2_bias)
+            expert_out = layer.down_linear[i](gate_up)
             outputs.append(expert_out)
             start_idx = end_idx
 
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 484b8aa9d107c..86cdd25f2c873 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -215,7 +215,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         )
         assert M_sum % block_m == 0
 
-        workspace1 = (M_sum, max(N, K))
+        workspace1 = (M_sum, N)
         workspace2 = (M_sum, max(N // 2, K))
         output = (M, K)
         return (workspace1, workspace2, output)
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
index 051abbcb7949d..bc9aab5208d9a 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
@@ -170,7 +170,7 @@ class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFin
         self._apply_router_weight_on_input(
             a1, topk_weights, topk_ids, apply_router_weight_on_input
         )
-        if not self.use_dp:
+        if not self.use_dp and quant_config.quant_dtype == "nvfp4":
             return a1, None, None, topk_ids, topk_weights
 
         a1q, a1q_scale = moe_kernel_quantize_input(
@@ -181,11 +181,13 @@ class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFin
             quant_config.block_shape,
             is_fp4_scale_swizzled=not self.use_dp,
         )
-        topk_weights, topk_ids, a1q, a1q_scale = get_dp_group().all_gatherv(
-            [topk_weights, topk_ids, a1q, a1q_scale],
-            dim=0,
-            sizes=get_local_sizes(),
-        )
+
+        if self.use_dp:
+            topk_weights, topk_ids, a1q, a1q_scale = get_dp_group().all_gatherv(
+                [topk_weights, topk_ids, a1q, a1q_scale],
+                dim=0,
+                sizes=get_local_sizes(),
+            )
         if quant_config.quant_dtype == "nvfp4":
             a1q_scale = nvfp4_block_scale_interleave(a1q_scale)
 
@@ -231,12 +233,13 @@ def flashinfer_alltoall_dispatch(
     max_num_token = (
         max(global_num_tokens_cpu) if global_num_tokens_cpu is not None else x.shape[0]
     )
+    orig_topk_weights_dtype = topk_weights.dtype
     alltoall_info, topk_ids, topk_weights, _ = (
         MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgather(
             topk_ids,
             topk_weights,
             None,
-            all2all_manager.prepare_workspace,
+            all2all_manager.prepare_workspace_tensor,
             max_num_token,
             ep_rank,
             ep_size,
@@ -245,6 +248,7 @@ def flashinfer_alltoall_dispatch(
             top_k,
         )
     )
+    topk_weights = topk_weights.view(dtype=orig_topk_weights_dtype)
 
     x, x_sf = moe_kernel_quantize_input(
         x,
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
index f21fe16c5108e..51e06ac54f497 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
@@ -3,6 +3,7 @@
 
 import torch
 
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
     calculate_tile_tokens_dim,
@@ -23,26 +24,24 @@ def flashinfer_fused_moe_blockscale_fp8(
     w2_weight_scale_inv: torch.Tensor,
     global_num_experts: int,
     top_k: int,
-    num_expert_group: int,
-    topk_group: int,
+    num_expert_group: int | None,
+    topk_group: int | None,
     intermediate_size: int,
     expert_offset: int,
     local_num_experts: int,
     block_shape: list[int],
-    routed_scaling: float = 1.0,
+    routing_method_type: int = RoutingMethodType.DeepSeekV3,
+    routed_scaling: float | None = 1.0,
 ) -> torch.Tensor:
     from vllm.utils.flashinfer import flashinfer_trtllm_fp8_block_scale_moe
 
+    topk_group = topk_group if topk_group is not None else 0
     assert top_k <= global_num_experts
-    assert top_k <= 8
-    assert topk_group <= 4
-    assert global_num_experts > num_expert_group
-    assert global_num_experts % num_expert_group == 0
+    assert top_k <= 10
     assert global_num_experts % 4 == 0
-    assert top_k < (topk_group * global_num_experts / num_expert_group)
     assert block_shape == [128, 128]
-    # Routing kernel expects #experts <= #threads 256
-    assert global_num_experts <= 256
+    # Routing kernel expects #experts <= #threads 512
+    assert global_num_experts <= 512
 
     a_q, a_sf = per_token_group_quant_fp8(x, block_shape[1])
     # NOTE: scales of hidden states have to be transposed!
@@ -64,10 +63,8 @@ def flashinfer_fused_moe_blockscale_fp8(
         local_expert_offset=expert_offset,
         local_num_experts=local_num_experts,
         routed_scaling_factor=routed_scaling,
-        tile_tokens_dim=calculate_tile_tokens_dim(
-            x.shape[0], top_k, global_num_experts
-        ),
-        routing_method_type=2,  # DeepSeek-styled routing method
+        tile_tokens_dim=None,
+        routing_method_type=routing_method_type,
         use_shuffled_weight=False,
     )
 
@@ -88,6 +85,7 @@ def flashinfer_fused_moe_blockscale_fp8_fake(
     expert_offset: int,
     local_num_experts: int,
     block_shape: list[int],
+    routing_method_type: int,
     routed_scaling: float = 1.0,
 ) -> torch.Tensor:
     return torch.empty_like(x)
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index d0f5eb498127b..2e042d85fcfcf 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -14,6 +14,7 @@ import torch.nn.functional as F
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
@@ -55,8 +56,6 @@ from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
 from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
 
-from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled
-
 logger = init_logger(__name__)
 
 
@@ -819,8 +818,8 @@ def get_config_file_name(
 ) -> str:
     device_name = current_platform.get_device_name().replace(" ", "_")
     # Set device_name to H200 if a device from the H200 family is detected
-    if "H200" in device_name:
-        device_name = "H200"
+    if "H200" in device_name.split("_"):
+        device_name = "NVIDIA_H200"
     dtype_selector = "" if not dtype else f",dtype={dtype}"
     block_shape_selector = (
         "" if not block_shape or not all(block_shape) else f",block_shape={block_shape}"
@@ -1089,11 +1088,11 @@ def vllm_topk_softmax(
     return topk_weights, topk_indices
 
 
-def dispatch_topk_func() -> Callable[..., tuple[torch.Tensor, ...]]:
-    if is_rocm_aiter_moe_enabled():
-        from .rocm_aiter_fused_moe import rocm_aiter_topk_softmax
-
-        return rocm_aiter_topk_softmax
+def dispatch_topk_func(
+    use_rocm_aiter: bool = False,
+) -> Callable[..., tuple[torch.Tensor, ...]]:
+    if use_rocm_aiter:
+        return rocm_aiter_ops.topk_softmax
     return vllm_topk_softmax
 
 
@@ -1121,7 +1120,7 @@ def fused_topk(
         M, topk, dtype=torch.int32, device=hidden_states.device
     )
 
-    topk_func = dispatch_topk_func()
+    topk_func = dispatch_topk_func(use_rocm_aiter=rocm_aiter_ops.is_fused_moe_enabled())
     topk_weights, topk_ids = topk_func(
         topk_weights, topk_ids, token_expert_indices, gating_output, renormalize
     )
@@ -1330,24 +1329,37 @@ def fused_grouped_topk(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert hidden_states.size(0) == gating_output.size(0), "Number of tokens mismatch"
 
-    if scoring_func == "softmax":
+    if scoring_func == "sigmoid":
+        # Fully fused kernel path for sigmoid
+        topk_values, topk_indices = ops.grouped_topk(
+            gating_output,  # raw logits
+            num_expert_group,
+            topk_group,
+            topk,
+            renormalize,
+            routed_scaling_factor,
+            e_score_correction_bias.to(gating_output.dtype),
+            1,  # scoring_func=1 for sigmoid
+        )
+    elif scoring_func == "softmax":
+        # Apply softmax in Python, then use fused kernel
+        # TODO: Add support for softmax in kernel
         scores = torch.softmax(gating_output, dim=-1)
-    elif scoring_func == "sigmoid":
-        scores = gating_output.sigmoid()
+        topk_values, topk_indices = ops.grouped_topk(
+            scores,  # pre-computed scores
+            num_expert_group,
+            topk_group,
+            topk,
+            renormalize,
+            routed_scaling_factor,
+            e_score_correction_bias.to(gating_output.dtype),
+            0,  # scoring_func=0 (no activation, scores already computed)
+        )
     else:
         raise ValueError(f"Unsupported scoring function: {scoring_func}")
 
-    scores_with_bias = scores + e_score_correction_bias.unsqueeze(0)
-    topk_values, topk_indices = ops.grouped_topk(
-        scores,
-        scores_with_bias.to(scores.dtype),
-        num_expert_group,
-        topk_group,
-        topk,
-        renormalize,
-        routed_scaling_factor,
-    )
-    return topk_values.to(torch.float32), topk_indices.to(torch.int32)
+    # Fused kernel outputs float32 values and int32 indices directly
+    return topk_values, topk_indices
 
 
 def inplace_fused_experts(
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 46d351b48c5e8..39547cc83c7b6 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -13,6 +13,7 @@ import torch.nn.functional as F
 from torch.nn.parameter import UninitializedParameter
 
 import vllm.envs as envs
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.config.parallel import ExpertPlacementStrategy
 from vllm.distributed import (
@@ -30,6 +31,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEParallelConfig,
     FusedMoEQuantConfig,
+    RoutingMethodType,
     biased_moe_quant_config,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
@@ -41,8 +43,6 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
 )
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
     init_aiter_topK_meta_data,
-    is_rocm_aiter_fusion_shared_expert_enabled,
-    is_rocm_aiter_moe_enabled,
 )
 from vllm.model_executor.layers.fused_moe.routing_simulator import RoutingSimulator
 from vllm.model_executor.layers.quantization.base_config import (
@@ -92,13 +92,11 @@ else:
         return topk_ids
 
     eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record
+from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
+from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
+    rocm_aiter_grouped_topk,
+)
 
-if is_rocm_aiter_moe_enabled():
-    from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
-        rocm_aiter_grouped_topk as grouped_topk_aiter,
-    )
-else:
-    from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
 if current_platform.is_tpu():
     from .moe_pallas import fused_moe as fused_moe_pallas
 else:
@@ -117,10 +115,8 @@ class FusedMoeWeightScaleSupported(Enum):
 class FusedMoEMethodBase(QuantizeMethodBase):
     def __init__(self, moe: FusedMoEConfig):
         super().__init__()
-        self.moe = moe
+        self.moe: FusedMoEConfig = moe
         self.moe_quant_config: FusedMoEQuantConfig | None = None
-        self.fused_experts: FusedMoEModularKernel | None = None
-        self.topk_indices_dtype = None
 
     @abstractmethod
     def create_weights(
@@ -245,9 +241,9 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         else:
             return None
 
-    # Note: init_prepare_finalize should only be called by
-    # prepare_communication_buffer_for_model.
-    def init_prepare_finalize(self, layer: torch.nn.Module):
+    def maybe_init_modular_kernel(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEModularKernel | None:
         assert self.moe is not None
 
         # We must get the quant config here so that the layer is
@@ -261,17 +257,14 @@ class FusedMoEMethodBase(QuantizeMethodBase):
             logger.debug(
                 "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self)
             )
-            assert self.topk_indices_dtype is None
-            assert self.fused_experts is None, (
-                f"Attempt to override experts for {id(self)}!"
-            )
-            self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
             experts = self.select_gemm_impl(prepare_finalize, layer)
-            self.fused_experts = FusedMoEModularKernel(
+            return FusedMoEModularKernel(
                 prepare_finalize,
                 experts,
                 layer.shared_experts,
             )
+        else:
+            return None
 
     def select_gemm_impl(
         self,
@@ -292,8 +285,16 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         raise NotImplementedError
 
     @property
-    def using_modular_kernel(self) -> bool:
-        return self.fused_experts is not None
+    def topk_indices_dtype(self) -> torch.dtype | None:
+        return None
+
+    @property
+    def supports_eplb(self) -> bool:
+        return False
+
+    @property
+    def allow_inplace(self) -> bool:
+        return False
 
     @abstractmethod
     def apply(
@@ -322,13 +323,146 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         raise NotImplementedError
 
 
+@CustomOp.register("modular_fused_moe")
+class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
+    def __init__(
+        self,
+        old_quant_method: FusedMoEMethodBase,
+        fused_experts: FusedMoEModularKernel,
+    ):
+        super().__init__(old_quant_method.moe)
+        # Find better way to copy attributes?  Should we even copy attributes?
+        # self.__dict__.update(old_quant_method.__dict__)
+        self.moe_quant_config = old_quant_method.moe_quant_config
+        self.fused_experts = fused_experts
+        self.disable_expert_map = getattr(
+            old_quant_method,
+            "disable_expert_map",
+            not fused_experts.supports_expert_map(),
+        )
+        self.old_quant_method = old_quant_method
+        logger.debug("Swapping out %s", self.old_quant_method.__class__.__name__)
+
+    @property
+    def topk_indices_dtype(self) -> torch.dtype | None:
+        return self.fused_experts.prepare_finalize.topk_indices_dtype()
+
+    @property
+    def supports_eplb(self) -> bool:
+        return self.old_quant_method.supports_eplb
+
+    @property
+    def allow_inplace(self) -> bool:
+        return self.old_quant_method.allow_inplace
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        raise NotImplementedError
+
+    def get_fused_moe_quant_config(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        return self.moe_quant_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        # Is getattr needed?
+        zero_expert_num = getattr(layer, "zero_expert_num", 0)
+        zero_expert_type = getattr(layer, "zero_expert_type", None)
+
+        if enable_eplb:
+            if self.supports_eplb:
+                assert expert_load_view is not None
+                assert logical_to_physical_map is not None
+                assert logical_replica_count is not None
+                assert isinstance(layer, FusedMoE)
+            else:
+                raise NotImplementedError(
+                    "EPLB is not supported for "
+                    f"{self.old_quant_method.__class__.__name__}."
+                )
+
+        topk_weights, topk_ids, zero_expert_result = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            routed_scaling_factor=routed_scaling_factor,
+            e_score_correction_bias=e_score_correction_bias,
+            indices_type=self.topk_indices_dtype,
+            enable_eplb=enable_eplb,
+            expert_map=expert_map,
+            expert_load_view=expert_load_view,
+            logical_to_physical_map=logical_to_physical_map,
+            logical_replica_count=logical_replica_count,
+            global_num_experts=global_num_experts,
+            zero_expert_num=zero_expert_num,
+            zero_expert_type=zero_expert_type,
+        )
+
+        result = self.fused_experts(
+            hidden_states=x,
+            w1=layer.w13_weight,
+            w2=layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=self.allow_inplace,
+            activation=activation,
+            global_num_experts=global_num_experts,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            expert_map=None if self.disable_expert_map else expert_map,
+        )
+
+        if zero_expert_num != 0 and zero_expert_type is not None:
+            assert not isinstance(result, tuple), (
+                "Shared + zero experts are mutually exclusive not yet supported"
+            )
+            return result, zero_expert_result
+        else:
+            return result
+
+
 @CustomOp.register("unquantized_fused_moe")
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""
 
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
-        self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
+
+        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
         if self.rocm_aiter_moe_enabled:
             from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
 
@@ -378,6 +512,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 )
             self.flashinfer_cutlass_moe = None  # type: ignore
 
+    @property
+    def supports_eplb(self) -> bool:
+        return True
+
+    @property
+    def allow_inplace(self) -> bool:
+        return True
+
     def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None:
         if self.rocm_aiter_moe_enabled:
             return None
@@ -477,13 +619,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         # Padding the weight for better performance on ROCm
         layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
         layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
-        # Lazy import to avoid importing triton.
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            shuffle_weights,
-        )
 
         if self.rocm_aiter_moe_enabled:
-            shuffled_w13, shuffled_w2 = shuffle_weights(
+            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                 layer.w13_weight.data, layer.w2_weight.data
             )
 
@@ -499,10 +637,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         if current_platform.is_xpu():
             import intel_extension_for_pytorch as ipex
 
+            ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
             layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
                 layer.w13_weight,
                 layer.w2_weight,
                 use_prepack=True,
+                experts_start_id=ep_rank_start,
             )
         elif current_platform.is_cpu():
             from vllm.model_executor.layers.fused_moe import cpu_fused_moe
@@ -650,7 +790,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         )
 
         if self.rocm_aiter_moe_enabled:
-            assert self.fused_experts is None
             result = self.rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -671,21 +810,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 activation=activation,
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
-        elif self.fused_experts is not None:
-            result = self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-            )
         else:
-            assert fused_experts is not None
             result = fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -787,7 +912,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             or logical_replica_count is not None
         ):
             raise NotImplementedError("Expert load balancing is not supported for XPU.")
-        assert custom_routing_function is None
         return layer.ipex_fusion(
             x,
             use_grouped_topk,
@@ -796,6 +920,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize,
             topk_group,
             num_expert_group,
+            custom_routing_function=custom_routing_function,
         )
 
     def forward_tpu(
@@ -872,6 +997,7 @@ def determine_expert_map(
     global_num_experts: int,
     expert_placement_strategy: ExpertPlacementStrategy = "linear",
     num_fused_shared_experts: int = 0,
+    return_expert_mask: bool = False,
 ) -> tuple[int, torch.Tensor | None, torch.Tensor | None]:
     """
     Calculates how many experts should be assigned to each rank for EP and
@@ -934,7 +1060,7 @@ def determine_expert_map(
         )
 
     expert_mask = None
-    if is_rocm_aiter_moe_enabled():
+    if return_expert_mask:
         expert_mask = torch.ones(
             (global_num_experts + num_fused_shared_experts + 1,), dtype=torch.int32
         )
@@ -1050,7 +1176,7 @@ class FusedMoE(CustomOp):
         hidden_size: Input hidden state size of the transformer
         intermediate_size: Intermediate size of the experts
         params_dtype: Data type for the parameters.
-        reduce_results: Whether to all all_reduce on the output of the layer
+        reduce_results: Whether to all_reduce on the output of the layer
         renormalize: Whether to renormalize the logits in the fused_moe kernel
         quant_config: Quantization configure.
         enable_eplb: Whether to enable expert parallelism load balancer.
@@ -1088,6 +1214,7 @@ class FusedMoE(CustomOp):
         zero_expert_type: str | None = None,
         expert_mapping: list[tuple[str, str, int, str]] | None = None,
         n_shared_experts: int | None = None,
+        routing_method_type: int | None = None,
     ):
         super().__init__()
 
@@ -1162,14 +1289,18 @@ class FusedMoE(CustomOp):
         self.logical_replica_count: torch.Tensor | None = None
 
         # ROCm aiter shared experts fusion
+        self.rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+        self.aiter_fmoe_shared_expert_enabled = (
+            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+        )
+
         self.num_fused_shared_experts = (
             n_shared_experts
-            if n_shared_experts is not None
-            and is_rocm_aiter_fusion_shared_expert_enabled()
+            if n_shared_experts is not None and self.aiter_fmoe_shared_expert_enabled
             else 0
         )
         if (
-            not is_rocm_aiter_fusion_shared_expert_enabled()
+            not self.aiter_fmoe_shared_expert_enabled
             and self.num_fused_shared_experts != 0
         ):
             raise ValueError(
@@ -1216,6 +1347,7 @@ class FusedMoE(CustomOp):
                 global_num_experts=self.global_num_experts,
                 expert_placement_strategy=expert_placement_strategy,
                 num_fused_shared_experts=self.num_fused_shared_experts,
+                return_expert_mask=self.rocm_aiter_fmoe_enabled,
             )
             self.local_num_experts = local_num_experts
             self.register_buffer("expert_map", expert_map)
@@ -1267,7 +1399,25 @@ class FusedMoE(CustomOp):
                 "Only softmax scoring function is supported for non-grouped topk."
             )
 
-        moe = FusedMoEConfig(
+        # ToDo: Better logic to determine the routing method type
+        if routing_method_type is not None:
+            self.routing_method_type = routing_method_type
+        else:
+            if scoring_func == "sigmoid":
+                if self.use_grouped_topk:
+                    self.routing_method_type = RoutingMethodType.DeepSeekV3
+                elif self.top_k == 1:
+                    self.routing_method_type = RoutingMethodType.Llama4
+            elif self.scoring_func == "softmax":
+                self.routing_method_type = (
+                    RoutingMethodType.Renormalize
+                    if not self.renormalize
+                    else RoutingMethodType.RenormalizeNaive
+                )
+            else:
+                self.routing_method_type = RoutingMethodType.TopK
+
+        self.moe_config: FusedMoEConfig = FusedMoEConfig(
             num_experts=self.global_num_experts,
             experts_per_token=top_k,
             hidden_dim=hidden_size,
@@ -1279,24 +1429,26 @@ class FusedMoE(CustomOp):
             is_act_and_mul=is_act_and_mul,
             is_lora_enabled=vllm_config.lora_config is not None,
         )
-        self.moe_config: FusedMoEConfig = moe
+
         self.moe_quant_config: FusedMoEQuantConfig | None = None
         self.quant_config = quant_config
 
+        def _get_quant_method() -> FusedMoEMethodBase:
+            """
+            Helper method to ensure self.quant_method is never None and
+            of the proper type.
+            """
+            quant_method = None
+            if self.quant_config is not None:
+                quant_method = self.quant_config.get_quant_method(self, prefix)
+            if quant_method is None:
+                quant_method = UnquantizedFusedMoEMethod(self.moe_config)
+            assert isinstance(quant_method, FusedMoEMethodBase)
+            return quant_method
+
         # Note: get_quant_method will look at the layer's local_num_experts
         # for heuristic purposes, so it must be initialized first.
-        quant_method: QuantizeMethodBase | None = None
-        quant_method = (
-            UnquantizedFusedMoEMethod(moe)
-            if quant_config is None
-            else quant_config.get_quant_method(self, prefix)
-        )
-        if quant_method is None:
-            quant_method = UnquantizedFusedMoEMethod(moe)
-
-        assert quant_method is not None
-        assert isinstance(quant_method, FusedMoEMethodBase)
-        self.quant_method = quant_method
+        self.quant_method: FusedMoEMethodBase = _get_quant_method()
 
         if not self.moe_config.is_act_and_mul:
             # Avoid circular import
@@ -1305,7 +1457,7 @@ class FusedMoE(CustomOp):
             )
 
             if not isinstance(
-                quant_method, (UnquantizedFusedMoEMethod, ModelOptFp8MoEMethod)
+                self.quant_method, (UnquantizedFusedMoEMethod, ModelOptFp8MoEMethod)
             ):
                 raise NotImplementedError(
                     "is_act_and_mul=False is supported only for unquantized "
@@ -1316,20 +1468,18 @@ class FusedMoE(CustomOp):
                     "is_act_and_mul=False is supported only for CUDA for now"
                 )
 
-        if self.enable_eplb:
-            from vllm.model_executor.layers.quantization.fp8 import Fp8MoEMethod
-
-            if not isinstance(quant_method, (Fp8MoEMethod, UnquantizedFusedMoEMethod)):
-                # TODO: Add support for additional quantization methods.
-                # The implementation for other quantization methods does not
-                # contain essential differences, but the current quant API
-                # design causes duplicated work when extending to new
-                # quantization methods, so I'm leaving it for now.
-                # If you plan to add support for more quantization methods,
-                # please refer to the implementation in `Fp8MoEMethod`.
-                raise NotImplementedError(
-                    "EPLB is only supported for FP8 quantization for now."
-                )
+        if self.enable_eplb and not self.quant_method.supports_eplb:
+            # TODO: Add support for additional quantization methods.
+            # The implementation for other quantization methods does not
+            # contain essential differences, but the current quant API
+            # design causes duplicated work when extending to new
+            # quantization methods, so I'm leaving it for now.
+            # If you plan to add support for more quantization methods,
+            # please refer to the implementation in `Fp8MoEMethod`.
+            raise NotImplementedError(
+                f"EPLB is not supported {self.quant_method.__class__.__name__}. "
+                "EPLB is only supported for FP8 quantization for now."
+            )
 
         moe_quant_params = {
             "num_experts": self.local_num_experts,
@@ -1353,6 +1503,15 @@ class FusedMoE(CustomOp):
         self.batched_hidden_states: torch.Tensor | None = None
         self.batched_router_logits: torch.Tensor | None = None
 
+    # Note: maybe_init_modular_kernel should only be called by
+    # prepare_communication_buffer_for_model.
+    # This is called after all weight loading and post-processing, so it
+    # should be safe to swap out the quant_method.
+    def maybe_init_modular_kernel(self) -> None:
+        mk = self.quant_method.maybe_init_modular_kernel(self)
+        if mk is not None:
+            self.quant_method = FusedMoEModularMethod(self.quant_method, mk)
+
     @property
     def shared_experts(self) -> torch.nn.Module | None:
         return None
@@ -1431,13 +1590,16 @@ class FusedMoE(CustomOp):
                 ep_rank=self.ep_rank,
                 global_num_experts=self.global_num_experts,
                 num_fused_shared_experts=self.num_fused_shared_experts,
+                return_expert_mask=self.rocm_aiter_fmoe_enabled,
             )
             self.local_num_experts = local_num_experts
             self.register_buffer("expert_map", expert_map)
             self.register_buffer("expert_mask", expert_mask)
-            self._init_aiter_shared_experts_topK_buffer(
-                vllm_config=get_current_vllm_config(), dp_size=get_dp_group().world_size
-            )
+            if self.aiter_fmoe_shared_expert_enabled:
+                self._init_aiter_shared_experts_topK_buffer(
+                    vllm_config=get_current_vllm_config(),
+                    dp_size=get_dp_group().world_size,
+                )
 
     def _load_per_tensor_weight_scale(
         self,
@@ -1614,20 +1776,19 @@ class FusedMoE(CustomOp):
     def _init_aiter_shared_experts_topK_buffer(
         self, vllm_config: VllmConfig, dp_size: int
     ):
-        if is_rocm_aiter_fusion_shared_expert_enabled():
-            if self.num_fused_shared_experts > 0:
-                init_aiter_topK_meta_data(
-                    n_routed_experts=self.global_num_experts,
-                    n_shared_experts=self.num_fused_shared_experts,
-                    top_k=self.top_k,
-                    tp_rank=self.ep_rank if self.use_ep else self.tp_rank,
-                    tp_size=self.ep_size if self.use_ep else self.tp_size,
-                    shared_experts_score=1.0,
-                    max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens
-                    * dp_size,
-                    is_EP=self.use_ep,
-                )
-            self.local_num_experts += self.num_fused_shared_experts
+        if self.num_fused_shared_experts > 0:
+            init_aiter_topK_meta_data(
+                n_routed_experts=self.global_num_experts,
+                n_shared_experts=self.num_fused_shared_experts,
+                top_k=self.top_k,
+                tp_rank=self.ep_rank if self.use_ep else self.tp_rank,
+                tp_size=self.ep_size if self.use_ep else self.tp_size,
+                shared_experts_score=1.0,
+                max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens
+                * dp_size,
+                is_EP=self.use_ep,
+            )
+        self.local_num_experts += self.num_fused_shared_experts
 
     @overload
     def weight_loader(
@@ -2066,18 +2227,19 @@ class FusedMoE(CustomOp):
             )
 
         # DeepSeekv2 uses grouped_top_k
-        if use_grouped_topk:
+        elif use_grouped_topk:
             assert topk_group is not None
             assert num_expert_group is not None
-            if is_rocm_aiter_moe_enabled():
-                if not is_rocm_aiter_fusion_shared_expert_enabled():
+            if rocm_aiter_ops.is_fused_moe_enabled():
+                if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
                     assert num_fused_shared_experts == 0
                 grouped_topk_impl = partial(
-                    grouped_topk_aiter,
+                    rocm_aiter_grouped_topk,
                     num_fused_shared_experts=num_fused_shared_experts,
                 )
             else:
                 grouped_topk_impl = grouped_topk
+
             topk_weights, topk_ids = grouped_topk_impl(
                 hidden_states=hidden_states,
                 gating_output=router_logits,
@@ -2167,7 +2329,7 @@ class FusedMoE(CustomOp):
         """
         assert self.quant_method is not None
         return (
-            self.quant_method.fused_experts is not None
+            isinstance(self.quant_method, FusedMoEModularMethod)
             and self.quant_method.fused_experts.output_is_reduced()
         )
 
@@ -2194,6 +2356,16 @@ class FusedMoE(CustomOp):
                 value=0.0,
             )
 
+        def reduce_output(states: torch.Tensor) -> torch.Tensor:
+            if (
+                not self.is_sequence_parallel
+                and not self.use_dp_chunking
+                and self.reduce_results
+                and (self.tp_size > 1 or self.ep_size > 1)
+            ):
+                states = self.maybe_all_reduce_tensor_model_parallel(states)
+            return states
+
         if self.shared_experts is None:
             if current_platform.is_tpu():
                 # TODO: Once the OOM issue for the TPU backend is resolved, we
@@ -2204,7 +2376,14 @@ class FusedMoE(CustomOp):
                 fused_output = torch.ops.vllm.moe_forward(
                     hidden_states, router_logits, self.layer_name
                 )
-            return fused_output[..., :og_hidden_states]
+            if self.zero_expert_num is not None and self.zero_expert_num > 0:
+                assert isinstance(fused_output, tuple)
+                fused_output, zero_expert_result = fused_output
+                return (reduce_output(fused_output) + zero_expert_result)[
+                    ..., :og_hidden_states
+                ]
+            else:
+                return reduce_output(fused_output)[..., :og_hidden_states]
         else:
             if current_platform.is_tpu():
                 # TODO: Once the OOM issue for the TPU backend is resolved, we
@@ -2217,8 +2396,8 @@ class FusedMoE(CustomOp):
                     hidden_states, router_logits, self.layer_name
                 )
             return (
-                shared_output[..., :og_hidden_states],
-                fused_output[..., :og_hidden_states],
+                reduce_output(shared_output)[..., :og_hidden_states],
+                reduce_output(fused_output)[..., :og_hidden_states],
             )
 
     def forward_cuda(
@@ -2277,28 +2456,6 @@ class FusedMoE(CustomOp):
             staged_hidden_states.copy_(hidden_states, non_blocking=True)
             staged_router_logits.copy_(router_logits, non_blocking=True)
 
-            # If there are shared experts but we are not using a modular kernel,
-            # the shared experts must be called here
-            if has_separate_shared_experts:
-                assert self.shared_experts is not None
-
-                if self.shared_experts_stream is not None:
-                    # For chunked, we start the shared experts stream here
-                    # (Note that no concurrency with the router/gate)
-                    self.shared_experts_stream.wait_stream(current_stream())
-
-                    with torch.cuda.stream(self.shared_experts_stream):
-                        # Note that staged_hidden_states clone() is necessary
-                        # here to avoid conflict with the main stream
-                        shared_output = self.shared_experts(
-                            staged_hidden_states.clone()
-                        )
-                else:
-                    shared_output = self.shared_experts(staged_hidden_states)
-
-            else:
-                shared_output = None
-
             # Matrix multiply.
             final_hidden_states = self.quant_method.apply(
                 layer=self,
@@ -2309,7 +2466,7 @@ class FusedMoE(CustomOp):
                 use_grouped_topk=self.use_grouped_topk,
                 global_num_experts=self.global_num_experts,
                 expert_map=self.expert_map
-                if not is_rocm_aiter_moe_enabled()
+                if not self.rocm_aiter_fmoe_enabled
                 else self.expert_mask,
                 topk_group=self.topk_group,
                 num_expert_group=self.num_expert_group,
@@ -2327,11 +2484,7 @@ class FusedMoE(CustomOp):
             if has_separate_shared_experts:
                 assert not isinstance(final_hidden_states, tuple)
                 assert self.shared_experts is not None
-
-                # Here we finish the shared experts stream
-                if self.shared_experts_stream is not None:
-                    current_stream().wait_stream(self.shared_experts_stream)
-
+                shared_output = self.shared_experts(staged_hidden_states)
                 final_hidden_states = (
                     shared_output,
                     final_hidden_states,
@@ -2403,7 +2556,7 @@ class FusedMoE(CustomOp):
         self.ensure_dp_chunking_init()
 
         has_separate_shared_experts = (
-            not isinstance(self.quant_method.fused_experts, FusedMoEModularKernel)
+            not isinstance(self.quant_method, FusedMoEModularMethod)
             and self.shared_experts is not None
         )
 
@@ -2430,8 +2583,8 @@ class FusedMoE(CustomOp):
                 hidden_states, router_logits, has_separate_shared_experts
             )
 
-        do_naive_dispatch_combine: bool = (
-            self.dp_size > 1 and not self.quant_method.using_modular_kernel
+        do_naive_dispatch_combine: bool = self.dp_size > 1 and not isinstance(
+            self.quant_method, FusedMoEModularMethod
         )
 
         # If there are shared experts but we are not using a modular kernel, the
@@ -2440,11 +2593,22 @@ class FusedMoE(CustomOp):
             assert self.shared_experts is not None
 
             if self.shared_experts_stream is not None:
+                # Clone BEFORE switching streams to avoid race condition
+                # where routed_expert kernel may mutate hidden_states.
+                hidden_states_clone = hidden_states.clone()
+                self.shared_experts_stream.wait_stream(current_stream())
+
                 # Run shared experts in parallel on a separate stream
                 with torch.cuda.stream(self.shared_experts_stream):
-                    # Note that hidden_states clone() is necessary here to avoid
-                    # conflict with the main stream
-                    shared_output = self.shared_experts(hidden_states.clone())
+                    shared_output = self.shared_experts(hidden_states_clone)
+
+                # Record that the clone will be used by shared_experts_stream
+                # to avoid gc issue from deallocation of hidden_states_clone
+                # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
+                # NOTE: we dont need shared_output.record_stream(current_stream())
+                # because we synch the streams before using shared_output.
+                hidden_states_clone.record_stream(self.shared_experts_stream)
+
             else:
                 shared_output = self.shared_experts(hidden_states)
         else:
@@ -2473,7 +2637,7 @@ class FusedMoE(CustomOp):
                 use_grouped_topk=self.use_grouped_topk,
                 global_num_experts=self.global_num_experts,
                 expert_map=self.expert_map
-                if not is_rocm_aiter_moe_enabled()
+                if not self.rocm_aiter_fmoe_enabled
                 else self.expert_mask,
                 topk_group=self.topk_group,
                 num_expert_group=self.num_expert_group,
@@ -2505,31 +2669,21 @@ class FusedMoE(CustomOp):
                 assert isinstance(final_hidden_states, tuple)
                 final_hidden_states, zero_expert_result = final_hidden_states
 
-            def reduce_output(
-                states: torch.Tensor, do_combine: bool = True
-            ) -> torch.Tensor:
-                if do_naive_dispatch_combine and do_combine:
+            def combine_output(states: torch.Tensor) -> torch.Tensor:
+                if do_naive_dispatch_combine:
                     states = get_ep_group().combine(states, self.is_sequence_parallel)
-
-                if (
-                    not self.is_sequence_parallel
-                    and self.reduce_results
-                    and (self.tp_size > 1 or self.ep_size > 1)
-                ):
-                    states = self.maybe_all_reduce_tensor_model_parallel(states)
-
                 return states
 
             if self.shared_experts is not None:
                 return (
-                    reduce_output(final_hidden_states[0], do_combine=False),
-                    reduce_output(final_hidden_states[1]),
+                    final_hidden_states[0],
+                    combine_output(final_hidden_states[1]),
                 )
             elif self.zero_expert_num is not None and self.zero_expert_num > 0:
                 assert isinstance(final_hidden_states, torch.Tensor)
-                return reduce_output(final_hidden_states) + zero_expert_result
+                return (combine_output(final_hidden_states), zero_expert_result)
             else:
-                return reduce_output(final_hidden_states)
+                return combine_output(final_hidden_states)
 
     @classmethod
     def make_expert_params_mapping(
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 3b5916f8ccaf8..b5fa2c71bec58 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -707,6 +707,12 @@ class FusedMoEModularKernel(torch.nn.Module):
             f"{fused_experts.activation_formats[0]}"
         )
 
+    def supports_expert_map(self) -> bool:
+        """
+        A flag indicating whether or not this class supports expert maps.
+        """
+        return self.fused_experts.supports_expert_map()
+
     def output_is_reduced(self) -> bool:
         """
         Indicates whether or not the output of fused MoE kernel
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index e18514ad43f6d..8f05828d74f5f 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -1,17 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import IntEnum
-from functools import cache, lru_cache
+from functools import lru_cache
 
 import torch
 
-from vllm import envs
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.layers.fused_moe.config import (
     FUSED_MOE_UNQUANTIZED_CONFIG,
     FusedMoEQuantConfig,
 )
-from vllm.platforms import current_platform
-from vllm.utils.torch_utils import direct_register_custom_op
 
 
 class QuantMethod(IntEnum):
@@ -37,27 +35,6 @@ class ActivationMethod(IntEnum):
     GELU = 1
 
 
-@cache
-def is_rocm_aiter_moe_enabled() -> bool:
-    return (
-        current_platform.is_rocm()
-        and envs.VLLM_ROCM_USE_AITER_MOE
-        and envs.VLLM_ROCM_USE_AITER
-    )
-
-
-@cache
-def use_mxfp4_aiter_moe() -> bool:
-    return current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER
-
-
-@cache
-def is_rocm_aiter_fusion_shared_expert_enabled() -> bool:
-    return (
-        envs.VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS and is_rocm_aiter_moe_enabled()
-    )
-
-
 aiter_topK_meta_data = None
 
 
@@ -114,250 +91,6 @@ def init_aiter_topK_meta_data(
     aiter_topK_meta_data = (total_topk_weights, total_topk_ids)
 
 
-def rocm_aiter_asm_moe_tkw1_impl(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    fc1_scale: torch.Tensor | None = None,
-    fc2_scale: torch.Tensor | None = None,
-    fc1_smooth_scale: torch.Tensor | None = None,
-    fc2_smooth_scale: torch.Tensor | None = None,
-    a16: bool = False,
-    per_tensor_quant_scale: torch.Tensor | None = None,
-    expert_mask: torch.Tensor | None = None,
-    activation_method: int = ActivationMethod.SILU.value,
-) -> torch.Tensor:
-    from aiter import ActivationType
-    from aiter.fused_moe_bf16_asm import asm_moe_tkw1
-
-    activation = ActivationType(activation_method)
-
-    return asm_moe_tkw1(
-        hidden_states,
-        w1,
-        w2,
-        topk_weights,
-        topk_ids,
-        fc1_scale=fc1_scale,
-        fc2_scale=fc2_scale,
-        fc1_smooth_scale=fc1_smooth_scale,
-        fc2_smooth_scale=fc2_smooth_scale,
-        a16=a16,
-        per_tensor_quant_scale=per_tensor_quant_scale,
-        expert_mask=expert_mask,
-        activation=activation,
-    )
-
-
-def rocm_aiter_asm_moe_tkw1_fake(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    fc1_scale: torch.Tensor | None = None,
-    fc2_scale: torch.Tensor | None = None,
-    fc1_smooth_scale: torch.Tensor | None = None,
-    fc2_smooth_scale: torch.Tensor | None = None,
-    a16: bool = False,
-    per_tensor_quant_scale: torch.Tensor | None = None,
-    expert_mask: torch.Tensor | None = None,
-    activation_method: int = ActivationMethod.SILU.value,
-) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-def rocm_aiter_topk_softmax_impl(
-    topk_weights: torch.Tensor,
-    topk_indices: torch.Tensor,
-    token_expert_indices: torch.Tensor,
-    gating_output: torch.Tensor,
-    renormalize: bool,
-) -> None:
-    from aiter import topk_softmax
-
-    topk_softmax(
-        topk_weights, topk_indices, token_expert_indices, gating_output, renormalize
-    )
-
-
-def rocm_aiter_topk_softmax_fake(
-    topk_weights: torch.Tensor,
-    topk_indices: torch.Tensor,
-    token_expert_indices: torch.Tensor,
-    gating_output: torch.Tensor,
-    renormalize: bool,
-) -> None:
-    pass
-
-
-def rocm_aiter_biased_grouped_topk_impl(
-    gating_output: torch.Tensor,
-    correction_bias: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    num_expert_group: int,
-    topk_group: int,
-    need_renorm: bool,
-    routed_scaling_factor: float = 1.0,  # mul to topk_weights
-) -> None:
-    from aiter import biased_grouped_topk
-
-    biased_grouped_topk(
-        gating_output,
-        correction_bias,
-        topk_weights,
-        topk_ids,
-        num_expert_group,
-        topk_group,
-        need_renorm,
-        routed_scaling_factor,
-    )
-
-
-def rocm_aiter_biased_grouped_topk_fake(
-    gating_output: torch.Tensor,
-    correction_bias: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    num_expert_group: int,
-    topk_group: int,
-    need_renorm: bool,
-    routed_scaling_factor: float = 1.0,  # mul to topk_weights
-) -> None:
-    pass
-
-
-def rocm_aiter_grouped_topk_impl(
-    gating_output: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    num_expert_group: int,
-    topk_group: int,
-    need_renorm: bool,
-    scoring_func: str = "softmax",
-    routed_scaling_factor: float = 1.0,  # mul to topk_weights
-) -> None:
-    from aiter import grouped_topk
-
-    grouped_topk(
-        gating_output,
-        topk_weights,
-        topk_ids,
-        num_expert_group,
-        topk_group,
-        need_renorm,
-        scoring_func,
-        routed_scaling_factor,
-    )
-
-
-def rocm_aiter_grouped_topk_fake(
-    gating_output: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    num_expert_group: int,
-    topk_group: int,
-    need_renorm: bool,
-    scoring_func: str = "softmax",
-    routed_scaling_factor: float = 1.0,  # mul to topk_weights
-) -> None:
-    pass
-
-
-def rocm_aiter_fused_moe_impl(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weight: torch.Tensor,
-    topk_ids: torch.Tensor,
-    expert_mask: torch.Tensor | None = None,
-    activation_method: int = ActivationMethod.SILU.value,
-    quant_method: int = QuantMethod.NO.value,
-    doweight_stage1: bool = False,
-    w1_scale: torch.Tensor | None = None,
-    w2_scale: torch.Tensor | None = None,
-    a1_scale: torch.Tensor | None = None,
-    a2_scale: torch.Tensor | None = None,
-) -> torch.Tensor:
-    from aiter import ActivationType, QuantType
-    from aiter.fused_moe import fused_moe
-
-    activation = ActivationType(activation_method)
-    quant_type = QuantType(quant_method)
-
-    return fused_moe(
-        hidden_states,
-        w1,
-        w2,
-        topk_weight,
-        topk_ids,
-        expert_mask,
-        activation,
-        quant_type,
-        doweight_stage1,
-        w1_scale,
-        w2_scale,
-        a1_scale,
-        a2_scale,
-    )
-
-
-def rocm_aiter_fused_moe_fake(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weight: torch.Tensor,
-    topk_ids: torch.Tensor,
-    expert_mask: torch.Tensor | None = None,
-    activation_method: int = ActivationMethod.SILU.value,
-    quant_method: int = QuantMethod.NO.value,
-    doweight_stage1: bool = False,
-    w1_scale: torch.Tensor | None = None,
-    w2_scale: torch.Tensor | None = None,
-    a1_scale: torch.Tensor | None = None,
-    a2_scale: torch.Tensor | None = None,
-) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-if current_platform.is_rocm():
-    direct_register_custom_op(
-        op_name="rocm_aiter_asm_moe_tkw1",
-        op_func=rocm_aiter_asm_moe_tkw1_impl,
-        fake_impl=rocm_aiter_asm_moe_tkw1_fake,
-    )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_fused_moe",
-        op_func=rocm_aiter_fused_moe_impl,
-        fake_impl=rocm_aiter_fused_moe_fake,
-    )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_topk_softmax",
-        op_func=rocm_aiter_topk_softmax_impl,
-        mutates_args=["topk_weights", "topk_indices", "token_expert_indices"],
-        fake_impl=rocm_aiter_topk_softmax_fake,
-    )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_biased_grouped_topk",
-        op_func=rocm_aiter_biased_grouped_topk_impl,
-        mutates_args=["topk_weights", "topk_ids"],
-        fake_impl=rocm_aiter_biased_grouped_topk_fake,
-    )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_grouped_topk",
-        op_func=rocm_aiter_grouped_topk_impl,
-        mutates_args=["topk_weights", "topk_ids"],
-        fake_impl=rocm_aiter_grouped_topk_fake,
-    )
-
-
 def rocm_aiter_grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -372,7 +105,10 @@ def rocm_aiter_grouped_topk(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     token = hidden_states.shape[0]
     device = hidden_states.device
-    if is_rocm_aiter_fusion_shared_expert_enabled() and num_fused_shared_experts > 0:
+    if (
+        rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+        and num_fused_shared_experts > 0
+    ):
         assert aiter_topK_meta_data is not None, (
             "AITER topK meta data is not initialized. "
             "Please ensure that init_aiter_topK_meta_data "
@@ -397,7 +133,7 @@ def rocm_aiter_grouped_topk(
         topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)
 
     if e_score_correction_bias is not None:
-        torch.ops.vllm.rocm_aiter_biased_grouped_topk(
+        rocm_aiter_ops.biased_grouped_topk(
             gating_output,
             e_score_correction_bias.to(gating_output.dtype),
             topk_weights,
@@ -409,7 +145,7 @@ def rocm_aiter_grouped_topk(
         )
     else:
         assert scoring_func == "softmax" or scoring_func == "sigmoid"
-        torch.ops.vllm.rocm_aiter_grouped_topk(
+        rocm_aiter_ops.grouped_topk(
             gating_output,
             topk_weights,
             topk_ids,
@@ -420,7 +156,10 @@ def rocm_aiter_grouped_topk(
             routed_scaling_factor=routed_scaling_factor,
         )
 
-    if is_rocm_aiter_fusion_shared_expert_enabled() and num_fused_shared_experts > 0:
+    if (
+        rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+        and num_fused_shared_experts > 0
+    ):
         return total_topk_weights, total_topk_ids
     return topk_weights, topk_ids
 
@@ -464,7 +203,7 @@ def rocm_aiter_fused_experts(
             "Only support topk=1 when `apply_router_weight_on_input` is True"
         )
 
-        return torch.ops.vllm.rocm_aiter_asm_moe_tkw1(
+        return rocm_aiter_ops.asm_moe_tkw1(
             hidden_states,
             w1,
             w2,
@@ -482,7 +221,9 @@ def rocm_aiter_fused_experts(
 
     else:
         quant_method = QuantMethod.NO.value
-
+        # quark moe for mxfp4 w_dtype
+        if quant_config.use_mxfp4_w4a16:
+            quant_method = QuantMethod.BLOCK_1X32.value
         # w8a8 block-scaled
         if quant_config.block_shape is not None and quant_config.use_fp8_w8a8:
             assert not apply_router_weight_on_input, (
@@ -507,7 +248,7 @@ def rocm_aiter_fused_experts(
                 "Only support topk=1 when `apply_router_weight_on_input` is True"
             )
 
-        return torch.ops.vllm.rocm_aiter_fused_moe(
+        return rocm_aiter_ops.fused_moe(
             hidden_states,
             w1,
             w2,
@@ -522,39 +263,3 @@ def rocm_aiter_fused_experts(
             a2_scale=quant_config.a2_scale,
             doweight_stage1=apply_router_weight_on_input,
         )
-
-
-def rocm_aiter_topk_softmax(
-    topk_weights: torch.Tensor,
-    topk_indices: torch.Tensor,
-    token_expert_indices: torch.Tensor,
-    gating_output: torch.Tensor,
-    renormalize: bool,
-) -> tuple[torch.Tensor, ...]:
-    torch.ops.vllm.rocm_aiter_topk_softmax(
-        topk_weights, topk_indices, token_expert_indices, gating_output, renormalize
-    )
-    return topk_weights, topk_indices
-
-
-def shuffle_weights(
-    *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16)
-) -> tuple[torch.Tensor, ...]:
-    """
-    Applies shuffle_weight function from AITER to each
-    input tensor and returns them.
-
-    Rearranges (shuffles) the input tensor/s
-    into a specified block layout for optimized computation.
-
-    Args:
-        *tensors: Variable number of torch.Tensor objects.
-        layout: A pair of integers specifying the block sizes used to divide
-            the tensors during shuffling. Default is (16, 16).
-
-    Returns:
-    A Tuple of shuffled tensors.
-    """
-    from aiter.ops.shuffle import shuffle_weight
-
-    return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors)
diff --git a/vllm/model_executor/layers/fused_moe/routing_simulator.py b/vllm/model_executor/layers/fused_moe/routing_simulator.py
index 8b04cf4539e04..a01cdc4908b93 100644
--- a/vllm/model_executor/layers/fused_moe/routing_simulator.py
+++ b/vllm/model_executor/layers/fused_moe/routing_simulator.py
@@ -14,6 +14,10 @@ from typing import Any
 
 import torch
 
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 
 class RoutingStrategy(ABC):
     """Base class for token-to-expert routing strategies."""
@@ -290,6 +294,12 @@ class RoutingSimulator:
                 f"Available strategies: "
                 f"{list(RoutingSimulator._routing_strategies.keys())}"
             )
+        logger.warning_once(
+            "Simulating MoE routing using a %s strategy. "
+            "This should only be used for performance testing. "
+            "Model outputs will not be valid.",
+            strategy_name,
+        )
 
         strategy = RoutingSimulator._routing_strategies[strategy_name]
         return strategy.route_tokens(
diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
index 2db733b765cea..3d0c5636d6c0a 100644
--- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
@@ -3,7 +3,10 @@
 
 import torch
 
-from vllm.distributed import tensor_model_parallel_all_reduce
+from vllm.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 
 
@@ -25,14 +28,16 @@ class SharedFusedMoE(FusedMoE):
         super().__init__(**kwargs)
         self._shared_experts = shared_experts
 
-        # Disable shared expert overlap if EP is disabled or we are not using
-        # flashinfer + DP since there is nothing to be gained in this case.
-        # Disabling the overlap optimization also prevents the shared experts
-        # from being hidden from torch.compile.
+        # Disable shared expert overlap if we are using eplb, because of
+        # correctness issues, or if using flashinfer with DP, since there
+        # is nothing to be gained in this case. Disabling the overlap
+        # optimization also prevents the shared experts from being hidden
+        # from torch.compile.
         self.use_overlapped = (
             use_overlapped
             and not (
-                self.use_ep
+                # TODO(wentao): find the root cause and remove this condition
+                self.enable_eplb
                 or (self.use_flashinfer_cutlass_kernels and self.dp_size > 1)
             )
             and self._shared_experts is not None
@@ -65,7 +70,7 @@ class SharedFusedMoE(FusedMoE):
                 # should have been created with reduce_results=False.
                 if (
                     self.reduce_results
-                    and self.tp_size > 1
+                    and get_tensor_model_parallel_world_size() > 1
                     and self.must_reduce_shared_expert_outputs()
                 ):
                     shared_out = tensor_model_parallel_all_reduce(shared_out)
@@ -81,4 +86,12 @@ class SharedFusedMoE(FusedMoE):
                 hidden_states=hidden_states,
                 router_logits=router_logits,
             )
+            # ensure early TP reduction of shared expert outputs when required
+            if (
+                shared_out is not None
+                and self.reduce_results
+                and get_tensor_model_parallel_world_size() > 1
+                and self.must_reduce_shared_expert_outputs()
+            ):
+                shared_out = tensor_model_parallel_all_reduce(shared_out)
         return shared_out, fused_out
diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
index e305483eb17db..132d35e65aba8 100644
--- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
@@ -127,10 +127,17 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
             "routing_method_type": 1,
             "do_finalize": True,
             "output": output,
-            "tune_max_num_tokens": self.max_capture_size,
+            "tune_max_num_tokens": max(self.max_capture_size, 1),
         }
 
         from flashinfer import trtllm_fp4_block_scale_routed_moe
 
-        trtllm_fp4_block_scale_routed_moe(**kwargs)
+        from vllm.utils.flashinfer import autotune
+
+        with autotune(False):
+            # Enable autotune when,
+            # https://github.com/flashinfer-ai/flashinfer/issues/2023 is
+            # resolved.
+            trtllm_fp4_block_scale_routed_moe(**kwargs)
+
         return output
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 65432c0fb2d4b..8cc374ac9155d 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -6,18 +6,13 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-import vllm.envs as envs
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.batch_invariant import (
     rms_norm_batch_invariant,
     vllm_is_batch_invariant,
 )
 from vllm.platforms import current_platform
-from vllm.utils.torch_utils import direct_register_custom_op
-
-
-def is_rocm_aiter_rmsnorm_enabled() -> bool:
-    return envs.VLLM_ROCM_USE_AITER_RMSNORM and envs.VLLM_ROCM_USE_AITER
 
 
 def rms_norm(
@@ -58,80 +53,34 @@ def fused_add_rms_norm(
     return x, residual
 
 
-def rocm_aiter_rms_norm_impl(
-    x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
+def poly_norm(
+    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, variance_epsilon: float
 ) -> torch.Tensor:
-    import aiter as rocm_aiter
+    from vllm import _custom_ops as ops
 
-    if x.dim() > 2:
-        x_original_shape = x.shape
-        x = x.reshape(-1, x_original_shape[-1])
-        x = rocm_aiter.rms_norm(x, weight, variance_epsilon)
-        return x.reshape(x_original_shape)
-
-    return rocm_aiter.rms_norm(x, weight, variance_epsilon)
-
-
-def rocm_aiter_rmsnorm2d_fwd_with_add_impl(
-    x: torch.Tensor,
-    residual: torch.Tensor,
-    weight: torch.Tensor,
-    variance_epsilon: float,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    import aiter as rocm_aiter
-
-    residual_out = torch.empty_like(residual)
-    output = torch.empty_like(x)
-    rocm_aiter.rmsnorm2d_fwd_with_add(
-        output,  # output
-        x,  # input
-        residual,  # residual input
-        residual_out,  # residual output
+    out = torch.empty_like(x)
+    ops.poly_norm(
+        out,
+        x,
         weight,
+        bias,
         variance_epsilon,
     )
-    return output, residual_out
+    return out
 
 
-def rocm_aiter_rms_norm_fake(
-    x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
-) -> torch.Tensor:
-    return torch.empty_like(x)
-
-
-def rocm_aiter_rmsnorm2d_fwd_with_add_fake(
-    x: torch.Tensor,
-    residual: torch.Tensor,
-    weight: torch.Tensor,
-    variance_epsilon: float,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    return torch.empty_like(x), torch.empty_like(residual)
-
-
-if current_platform.is_rocm():
-    direct_register_custom_op(
-        op_name="rocm_aiter_rms_norm",
-        op_func=rocm_aiter_rms_norm_impl,
-        fake_impl=rocm_aiter_rms_norm_fake,
-    )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_rmsnorm2d_fwd_with_add",
-        op_func=rocm_aiter_rmsnorm2d_fwd_with_add_impl,
-        fake_impl=rocm_aiter_rmsnorm2d_fwd_with_add_fake,
-    )
-
-
-def dispatch_rocm_rmsnorm_func(with_fused_add: bool, dtype: torch.dtype):
-    use_aiter = is_rocm_aiter_rmsnorm_enabled() and dtype in [
+def dispatch_rocm_rmsnorm_func(
+    with_fused_add: bool, dtype: torch.dtype, use_aiter: bool = False
+):
+    use_aiter = use_aiter and dtype in [
         torch.float16,
         torch.bfloat16,
     ]
 
     if use_aiter and with_fused_add:
-        return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add
+        return rocm_aiter_ops.rms_norm2d_with_add
     if use_aiter:
-        return torch.ops.vllm.rocm_aiter_rms_norm
+        return rocm_aiter_ops.rms_norm
 
     # fall back to CUDA implementation
     if with_fused_add:
@@ -169,11 +118,14 @@ class RMSNorm(CustomOp):
             self.weight = nn.Parameter(self.weight)
 
         if current_platform.is_rocm():
+            aiter_rmsnorm_enabled = rocm_aiter_ops.is_rmsnorm_enabled()
             self.rocm_norm_func = dispatch_rocm_rmsnorm_func(
-                with_fused_add=False, dtype=weight_dtype
+                with_fused_add=False,
+                dtype=weight_dtype,
+                use_aiter=aiter_rmsnorm_enabled,
             )
             self.rocm_norm_func_with_add = dispatch_rocm_rmsnorm_func(
-                with_fused_add=True, dtype=weight_dtype
+                with_fused_add=True, dtype=weight_dtype, use_aiter=aiter_rmsnorm_enabled
             )
 
     @staticmethod
@@ -369,6 +321,109 @@ class GemmaRMSNorm(CustomOp):
         return self.forward_native(x, residual)
 
 
+@CustomOp.register("rms_norm_gated")
+class RMSNormGated(CustomOp):
+    """RMS Normalization with optional gating.
+
+    This is a native PyTorch implementation that supports:
+    - Standard RMS normalization
+    - Group RMS normalization
+    - Optional gating with SiLU activation
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-5,
+        group_size: int | None = None,
+        norm_before_gate: bool = False,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+    ):
+        """Initialize RMSNormGated.
+
+        Args:
+            hidden_size: Size of the hidden dimension
+            eps: Epsilon for numerical stability
+            group_size: If not None, do GroupNorm with each group
+                        having group_size elements.
+                        group_size=None is equivalent to group_size=hidden_size
+                        (i.e. there's only 1 group).
+            norm_before_gate: If True and z is provided: out = norm(x) * silu(z)
+                              If False and z is provided: out = norm(x * silu(z))
+            device: Device to create parameters on
+            dtype: Data type for parameters
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.register_parameter("bias", None)
+        self.group_size = group_size
+        self.norm_before_gate = norm_before_gate
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        torch.nn.init.ones_(self.weight)
+
+    def forward_native(
+        self, x: torch.Tensor, z: torch.Tensor | None = None
+    ) -> torch.Tensor:
+        """
+        Native PyTorch implementation of RMS normalization with gating.
+
+        Args:
+            x: Input tensor
+            z: Optional gating tensor
+
+        Returns:
+            Normalized (and optionally gated) tensor
+
+        If z is not None:
+            - norm_before_gate=True: out = norm(x) * silu(z)
+            - norm_before_gate=False: out = norm(x * silu(z))
+        """
+        # Apply gating before normalization if needed
+        if z is not None and not self.norm_before_gate:
+            x = x * F.silu(z)
+
+        # RMS Normalization
+        if self.group_size is None:
+            # Standard RMS norm across the last dimension
+            variance = x.pow(2).mean(dim=-1, keepdim=True)
+            x_normed = x * torch.rsqrt(variance + self.eps)
+            out = x_normed * self.weight
+        else:
+            # Group RMS norm
+            from einops import rearrange
+
+            x_group = rearrange(x, "... (g d) -> ... g d", d=self.group_size)
+            variance = x_group.pow(2).mean(dim=-1, keepdim=True)
+            x_normed = x_group * torch.rsqrt(variance + self.eps)
+            out = rearrange(x_normed, "... g d -> ... (g d)") * self.weight
+
+        # Apply gating after normalization if needed
+        if z is not None and self.norm_before_gate:
+            out = out * F.silu(z)
+
+        return out
+
+    def forward_cuda(
+        self, x: torch.Tensor, z: torch.Tensor | None = None
+    ) -> torch.Tensor:
+        from vllm.model_executor.layers.fla.ops.layernorm_guard import rmsnorm_fn
+
+        return rmsnorm_fn(
+            x,
+            self.weight,
+            self.bias,
+            z=z,
+            eps=self.eps,
+            group_size=self.group_size,
+            norm_before_gate=self.norm_before_gate,
+        )
+
+
 class LayerNorm(nn.Module):
     """
     Layer Normalization.
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index a9a0c216474bc..b6345b8af7f0a 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -241,18 +241,21 @@ class MambaMixer(MambaBase, CustomOp):
         forward_context: ForwardContext = get_forward_context()
         attn_metadata = forward_context.attn_metadata
 
+        assert self.cache_config is not None
+        mamba_block_size = self.cache_config.mamba_block_size
+        prefix_caching_enabled = self.cache_config.enable_prefix_caching
+
         if attn_metadata is not None:
             assert isinstance(attn_metadata, dict)
             attn_metadata = attn_metadata[self.prefix]
-            mamba1_metadata = attn_metadata
-            assert isinstance(mamba1_metadata, Mamba1AttentionMetadata)
-            query_start_loc = mamba1_metadata.query_start_loc
-            state_indices_tensor = mamba1_metadata.state_indices_tensor
+            assert isinstance(attn_metadata, Mamba1AttentionMetadata)
+            query_start_loc_p = attn_metadata.query_start_loc_p
+            state_indices_tensor = attn_metadata.state_indices_tensor
             self_kv_cache = self.kv_cache[forward_context.virtual_engine]
             conv_state = self_kv_cache[0].transpose(-1, -2)
             ssm_state = self_kv_cache[1]
-            has_initial_states = mamba1_metadata.has_initial_states
-            num_padded_decodes = mamba1_metadata.num_padded_decodes
+            has_initial_states_p = attn_metadata.has_initial_states_p
+            num_padded_decodes = attn_metadata.num_padded_decodes
 
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
@@ -279,12 +282,8 @@ class MambaMixer(MambaBase, CustomOp):
             hidden_states_BC,
             gate,
             state_indices_tensor,
-            query_start_loc,
-            has_initial_states,
             num_prefill_tokens,
-            num_decode_tokens,
             num_prefills,
-            num_decodes,
             num_padded_decodes,
         )
         hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p
@@ -293,8 +292,34 @@ class MambaMixer(MambaBase, CustomOp):
         gate_d = prefill_decode_split.gate_d
         state_indices_tensor_p = prefill_decode_split.state_indices_tensor_p
         state_indices_tensor_d = prefill_decode_split.state_indices_tensor_d
-        query_start_loc_p = prefill_decode_split.query_start_loc_p
-        has_initial_states_p = prefill_decode_split.has_initial_states_p
+
+        if prefix_caching_enabled:
+            block_idx_last_computed_token_d, block_idx_last_computed_token_p = (
+                torch.split(
+                    attn_metadata.block_idx_last_computed_token,
+                    [num_decodes, num_prefills],
+                    dim=0,
+                )
+            )
+            block_idx_last_scheduled_token_d, block_idx_last_scheduled_token_p = (
+                torch.split(
+                    attn_metadata.block_idx_last_scheduled_token,
+                    [num_decodes, num_prefills],
+                    dim=0,
+                )
+            )
+
+            block_idx_first_scheduled_token_p = (
+                attn_metadata.block_idx_first_scheduled_token_p
+            )
+            num_computed_tokens_p = attn_metadata.num_computed_tokens_p
+        else:
+            block_idx_last_computed_token_d = None
+            block_idx_last_computed_token_p = None
+            block_idx_last_scheduled_token_d = None
+            block_idx_last_scheduled_token_p = None
+            block_idx_first_scheduled_token_p = None
+            num_computed_tokens_p = None
 
         ssm_outputs = []
 
@@ -309,6 +334,11 @@ class MambaMixer(MambaBase, CustomOp):
                 has_initial_state=has_initial_states_p,
                 cache_indices=state_indices_tensor_p,
                 query_start_loc=query_start_loc_p,
+                block_idx_first_scheduled_token=block_idx_first_scheduled_token_p,
+                block_idx_last_scheduled_token=block_idx_last_scheduled_token_p,
+                initial_state_idx=block_idx_last_computed_token_p,
+                num_computed_tokens=num_computed_tokens_p,
+                block_size_to_align=mamba_block_size,
             )
             # 3. State Space Model sequence transformations.
             discrete_time_step_p, B_p, C_p = self._ssm_transform(
@@ -331,10 +361,24 @@ class MambaMixer(MambaBase, CustomOp):
                 cache_indices=state_indices_tensor_p,
                 has_initial_state=has_initial_states_p,
                 query_start_loc=query_start_loc_p,
+                block_size=mamba_block_size,
+                block_idx_first_scheduled_token=block_idx_first_scheduled_token_p,
+                block_idx_last_scheduled_token=block_idx_last_scheduled_token_p,
+                initial_state_idx=block_idx_last_computed_token_p,
             )
             ssm_outputs.append(scan_out_p)
 
         if has_decode:
+            if prefix_caching_enabled:
+                state_indices_tensor_d_input = state_indices_tensor_d.gather(
+                    1, block_idx_last_computed_token_d.unsqueeze(1)
+                ).squeeze(1)
+                state_indices_tensor_d_output = state_indices_tensor_d.gather(
+                    1, block_idx_last_scheduled_token_d.unsqueeze(1)
+                ).squeeze(1)
+            else:
+                state_indices_tensor_d_input = state_indices_tensor_d
+                state_indices_tensor_d_output = state_indices_tensor_d
             # 2. Convolution sequence transformation
             conv_out_d = causal_conv1d_update(
                 hidden_states_BC_d.transpose(0, 1),
@@ -343,6 +387,8 @@ class MambaMixer(MambaBase, CustomOp):
                 self.conv1d.bias,
                 self.activation,
                 conv_state_indices=state_indices_tensor_d,
+                block_idx_last_scheduled_token=block_idx_last_scheduled_token_d,
+                initial_state_idx=block_idx_last_computed_token_d,
             ).transpose(0, 1)
 
             # 3. State Space Model sequence transformation.
@@ -364,7 +410,8 @@ class MambaMixer(MambaBase, CustomOp):
                 gate_d.transpose(0, 1),
                 time_proj_bias,
                 dt_softplus=True,
-                state_batch_indices=state_indices_tensor_d,
+                state_batch_indices=state_indices_tensor_d_input,
+                dst_state_batch_indices=state_indices_tensor_d_output,
                 out=scan_outputs_d,
             )
             scan_outputs_d = scan_outputs_d.transpose(0, 1)
@@ -423,20 +470,14 @@ class PrefillDecodeSplit(NamedTuple):
     gate_d: torch.Tensor
     state_indices_tensor_p: torch.Tensor
     state_indices_tensor_d: torch.Tensor
-    query_start_loc_p: torch.Tensor | None
-    has_initial_states_p: torch.Tensor | None
 
 
 def split_batch_to_prefill_and_decode(
     hidden_states_BC: torch.Tensor,
     gate: torch.Tensor,
     state_indices_tensor: torch.Tensor,
-    query_start_loc: torch.Tensor,
-    has_initial_states: torch.Tensor | None,
     num_prefill_tokens: int,
-    num_decode_tokens: int,
     num_prefills: int,
-    num_decodes: int,
     num_padded_decodes: int,
 ) -> PrefillDecodeSplit:
     num_actual_tokens = num_prefill_tokens + num_padded_decodes
@@ -457,16 +498,6 @@ def split_batch_to_prefill_and_decode(
         [num_padded_decodes, num_prefills],
         dim=0,
     )
-    query_start_loc_p = (
-        query_start_loc[-num_prefills - 1 :] - num_padded_decodes
-        if num_prefills > 0
-        else None
-    )
-    has_initial_states_p = (
-        has_initial_states[-num_prefills:]
-        if (has_initial_states is not None and num_prefills > 0)
-        else None
-    )
 
     return PrefillDecodeSplit(
         hidden_states_BC_p=hidden_states_BC_p,
@@ -475,8 +506,6 @@ def split_batch_to_prefill_and_decode(
         gate_d=gate_d,
         state_indices_tensor_p=state_indices_tensor_p,
         state_indices_tensor_d=state_indices_tensor_d,
-        query_start_loc_p=query_start_loc_p,
-        has_initial_states_p=has_initial_states_p,
     )
 
 
diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
index 8722eb9a7b22f..53fd5d5458b09 100644
--- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
+++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@@ -375,6 +375,10 @@ def selective_scan_fn(
     cache_indices=None,
     has_initial_state=None,
     pad_slot_id=PAD_SLOT_ID,
+    block_size=1024,
+    block_idx_first_scheduled_token=None,
+    block_idx_last_scheduled_token=None,
+    initial_state_idx=None,
 ) -> torch.Tensor:
     """
     u: (dim, total_length) for varlen or (batch, dim, seqlen)
@@ -397,7 +401,10 @@ def selective_scan_fn(
         x.shape=(dim,17)
     cache_indices: (batch) int32
         A tensor with each cell is a correspondent
-        input and output ssm_state index
+        input and output ssm_state indices
+      - Without APC: (batch,) - single state index per batch item
+      - With APC: (batch, max_positions) - cache block indices for read/write
+        Each non-zero value indicates a cache block to load from and/or write to.
     has_initial_state: (batch) bool
         A tensor populated with ones and zeros,
         indicate if the ssm_state at the corresponding index should be
@@ -408,6 +415,17 @@ def selective_scan_fn(
         that will not be processed,
         for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
         in this case, the kernel will not process entries at indices 0 and 3
+    block_size: int
+        The block size to align the cached states to
+    block_idx_first_scheduled_token: (batch,), dtype int32
+        The pointer into cache_indices, where the first
+        cache block to be filled is located.
+    block_idx_last_scheduled_token: (batch,), dtype int32
+        The pointer into cache_indices, where the last cache block
+        to be filled is located.
+    initial_state_idx: (batch,), dtype int32
+        The pointer into cache_indices, where the cache block
+        containing the initial state is located.
     returns
         output: (dim, total_length) for varlen or (batch, dim, seqlen)
                 supports inplace replacement
@@ -448,6 +466,10 @@ def selective_scan_fn(
         has_initial_state,
         ssm_states,
         pad_slot_id,
+        block_size,
+        block_idx_first_scheduled_token,
+        block_idx_last_scheduled_token,
+        initial_state_idx,
     )
 
     if z is None:
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index daf7422963f3c..3e1f87b59a34d 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -617,8 +617,6 @@ class AWQMoEMethod(FusedMoEMethodBase):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.")
 
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index ccd9b311cc932..e5a741e639ad9 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -518,12 +518,11 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `BitsAndBytesMoEMethod` yet."
             )
+
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index bf38c15b47013..59567f2ca13c7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -12,6 +12,7 @@ from compressed_tensors.quantization import ActivationOrdering, QuantizationStra
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (
@@ -462,12 +463,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
             indices_type=self.topk_indices_dtype,
         )
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # flashinfer cutlass, cutlass fp4 or fused_experts but not marlin.
-        #
         if self.use_marlin:
-            assert self.fused_experts is None
             return fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -488,24 +484,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 workspace=layer.workspace,
             )
 
-        elif self.fused_experts is not None:
-            assert is_valid_flashinfer_cutlass_fused_moe(
-                x, layer.w13_weight, layer.w2_weight
-            ), "Flashinfer CUTLASS Fused MoE not applicable!"
-
-            return self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=False,  # TODO(shuw): fix later, now output is high prec
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-            )
-
         # FlashInfer fused experts path
         elif self.allow_flashinfer:
             from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
@@ -605,11 +583,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         # Disable marlin for rocm
         if current_platform.is_rocm():
             self.use_marlin = False
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            is_rocm_aiter_moe_enabled,
-        )
 
-        self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
+        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
 
         # cutlass path
         self.is_fp8_w8a8_sm100 = quant_config._is_fp8_w8a8_sm100(
@@ -852,12 +827,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
 
         # Property to determine if AITER is used
         if self.rocm_aiter_moe_enabled:
-            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa E501
-                shuffle_weights,
-            )
-
             # reshaping weights is required for aiter moe kernel.
-            shuffled_w13, shuffled_w2 = shuffle_weights(
+            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                 layer.w13_weight.data, layer.w2_weight.data
             )
 
@@ -995,10 +966,18 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 max_num_tokens=max_num_tokens_per_rank,
                 num_dispatchers=prepare_finalize.num_dispatchers(),
                 quant_config=self.moe_quant_config,
+                allow_deep_gemm=(
+                    envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
+                ),
             )
         else:
             logger.debug("TritonOrDeepGemmExperts(%s)", self.__class__.__name__)
-            return TritonOrDeepGemmExperts(self.moe_quant_config, allow_deep_gemm=True)
+            return TritonOrDeepGemmExperts(
+                self.moe_quant_config,
+                allow_deep_gemm=(
+                    envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
+                ),
+            )
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
@@ -1066,13 +1045,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
         per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # cutlass fp8 or fused_experts but not marlin or rocm.
-        #
         if self.use_marlin:
             assert activation == "silu", f"{activation} not supported for Marlin MoE."
-            assert self.fused_experts is None
             return fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -1098,7 +1072,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
 
             assert per_act_token == per_channel_quant
             assert self.moe_quant_config is not None
-            assert self.fused_experts is None
             return rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -1111,18 +1084,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 quant_config=self.moe_quant_config,
             )
 
-        elif self.fused_experts is not None:
-            return self.fused_experts(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights,
-                topk_ids,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=None if self.disable_expert_map else expert_map,
-            )
-
         # cutlass path
         elif self.use_cutlass:
             assert self.moe_quant_config is not None
@@ -1318,8 +1279,6 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet."
@@ -1636,8 +1595,6 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet."
@@ -1901,8 +1858,6 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet."
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
index 4127cd2d574bd..b603bdb13280b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -50,6 +50,9 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
         elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
             self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
             assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
+        elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass":
+            self.backend = "cutlass"
+            assert cutlass_fp4_supported(), f"Cutlass is required for {self.backend}"
 
         if self.backend == "none":
             raise ValueError(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index dc735a115e0b1..01204b10ea18a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -7,6 +7,7 @@ import torch
 from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from torch.nn import Parameter
 
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme,
@@ -16,7 +17,6 @@ from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
 )
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     W8A8BlockFp8LinearOp,
-    check_aiter_fp8_linear_support,
     create_fp8_input_scale,
     create_fp8_scale_parameter,
     create_fp8_weight_parameter,
@@ -73,7 +73,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
 
         if self.weight_block_size is not None:
             self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
-            self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()
+            self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled()
             assert not self.is_static_input_scheme
             self.act_q_group_shape = GroupShape(1, self.weight_block_size[0])
             self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 754608af97c6b..5241f9a2301be 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -158,8 +158,6 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `ExpertsInt8MoEMethod` yet."
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 6b613b21066a6..91115d7437e25 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -12,6 +12,7 @@ from torch.nn.parameter import Parameter
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.batch_invariant import (
@@ -27,6 +28,7 @@ from vllm.model_executor.layers.fused_moe import (
 )
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
+    RoutingMethodType,
     fp8_w8a8_moe_quant_config,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
@@ -41,7 +43,6 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
     init_fp8_linear_kernel,
 )
@@ -59,7 +60,6 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
 )
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     W8A8BlockFp8LinearOp,
-    check_aiter_fp8_linear_support,
     create_fp8_input_scale,
     create_fp8_scale_parameter,
     create_fp8_weight_parameter,
@@ -99,11 +99,9 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 from vllm.utils.deep_gemm import (
-    fp8_gemm_nt,
     get_col_major_tma_aligned_tensor,
     is_deep_gemm_e8m0_used,
     is_deep_gemm_supported,
-    should_use_deepgemm_for_fp8_linear,
 )
 from vllm.utils.flashinfer import has_flashinfer_moe
 from vllm.utils.import_utils import has_deep_gemm
@@ -143,6 +141,13 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
             logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100")
             return Fp8MoeBackend.FLASHINFER_TRTLLM
         else:
+            if block_quant:
+                raise ValueError(
+                    "FlashInfer FP8 MoE throughput backend does not "
+                    "support block quantization. Please use "
+                    "VLLM_FLASHINFER_MOE_BACKEND=latency "
+                    "instead."
+                )
             logger.info_once("Using FlashInfer FP8 MoE CUTLASS backend for SM100")
             return Fp8MoeBackend.FLASHINFER_CUTLASS
 
@@ -158,7 +163,7 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
         return Fp8MoeBackend.MARLIN
 
     # deepGEMM on supported platforms with block-quantized weights
-    if envs.VLLM_USE_DEEP_GEMM and block_quant:
+    if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant:
         if not has_deep_gemm():
             logger.warning_once("DeepGEMM backend requested but not available.")
         elif is_deep_gemm_supported():
@@ -367,7 +372,7 @@ class Fp8LinearMethod(LinearMethodBase):
         if vllm_is_batch_invariant():
             self.use_marlin = False
 
-        self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()
+        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled()
         self.use_deep_gemm = is_deep_gemm_supported()
 
         self.weight_block_size = self.quant_config.weight_block_size
@@ -556,83 +561,19 @@ class Fp8LinearMethod(LinearMethodBase):
         # if batch invariant mode is enabled, prefer DeepGEMM FP8 path
         # we will use BF16 dequant when DeepGEMM is not supported.
         if vllm_is_batch_invariant():
-            # Call is_deep_gemm_supported() ahead of time for torch.compile
-            # dynamo has trouble tracing through
-            if self.block_quant and should_use_deepgemm_for_fp8_linear(
-                torch.bfloat16, layer.weight, self.use_deep_gemm
-            ):
-                # use group quant consistent with block size across K
-                assert self.act_q_group_shape is not None
-                q_input, input_scale = QuantFP8(
-                    False,
-                    self.act_q_group_shape,
-                    column_major_scales=True,
-                )(x)
-
-                output_2d = torch.empty(
-                    (q_input.shape[0], layer.weight.shape[0]),
-                    dtype=torch.bfloat16,
-                    device=q_input.device,
-                )
-                fp8_gemm_nt(
-                    (q_input, input_scale),
-                    (layer.weight, layer.weight_scale),
-                    output_2d,
-                )
-                if bias is not None:
-                    output_2d = output_2d + bias
-                return output_2d
-
-            # Dequantize FP8 weights to BF16
-            weight_fp8 = layer.weight.to(torch.bfloat16)
-            weight_scale = layer.weight_scale.to(torch.bfloat16)
-
-            # Handle different quantization granularities
             if self.block_quant:
-                # Block-wise quantization:
-                # - Weight is NOT transposed, shape is [N, K] (output_size, input_size)
-                # - Scale has shape [num_blocks_k, num_blocks_n] (TRANSPOSED!)
                 assert self.weight_block_size is not None
-                block_n, block_k = self.weight_block_size  # Note: order is [N, K]
-
-                N, K = weight_fp8.shape
-
-                # determine expected number of blocks along N and K
-                num_blocks_n = (N + block_n - 1) // block_n
-                num_blocks_k = (K + block_k - 1) // block_k
-
-                # scale layout may be [num_blocks_n, num_blocks_k]
-                # or [num_blocks_k, num_blocks_n] depending on backend
-                if weight_scale.dim() != 2:
-                    raise RuntimeError(
-                        f"FP8 block scale must be 2D, got {tuple(weight_scale.shape)}"
-                    )
-
-                scale_rows, scale_cols = weight_scale.shape
-                if (scale_rows, scale_cols) == (num_blocks_k, num_blocks_n):
-                    if num_blocks_n == num_blocks_k:
-                        # ambiguous square case, warn and skip transpose
-                        logger.warning(
-                            "Batch-invariant FP8: square block-scale %dx%d; "
-                            "skipping transpose to avoid misorientation.",
-                            scale_rows,
-                            scale_cols,
-                        )
-                    else:
-                        # clear KN -> transpose to NK
-                        weight_scale = weight_scale.t()
-
-                # Expand scale to match weight dimensions
-                # scale_expanded should have shape [N, K]
-                scale_expanded = weight_scale.repeat_interleave(
-                    block_n, dim=0
-                ).repeat_interleave(block_k, dim=1)
-                # Trim to exact weight size (in case of padding)
-                scale_expanded = scale_expanded[:N, :K]
-                weight_bf16 = weight_fp8 * scale_expanded
+                return self.w8a8_block_fp8_linear.apply(
+                    input=x,
+                    weight=layer.weight,
+                    weight_scale=layer.weight_scale,
+                    input_scale=layer.input_scale,
+                    bias=bias,
+                )
             else:
-                # Per-tensor quantization: weight IS transposed to [K, N]
-                # scale should be scalar or [1] or per-output-channel [N]
+                # per-tensor/channel: dequant to BF16 and run GEMM
+                weight_fp8 = layer.weight.to(torch.bfloat16)
+                weight_scale = layer.weight_scale.to(torch.bfloat16)
                 if weight_scale.numel() == 1:
                     # Per-tensor: simple scalar multiplication
                     weight_bf16 = weight_fp8 * weight_scale
@@ -651,16 +592,7 @@ class Fp8LinearMethod(LinearMethodBase):
                     else:
                         # Fallback
                         weight_bf16 = weight_fp8 * weight_scale
-
-            # For block quant, weight is [N, K], for per-tensor it's [K, N]
-            # F.linear expects weight to be [N, K], so:
-            if self.block_quant:
-                # Already in correct shape [N, K]
-                output = torch.nn.functional.linear(x, weight_bf16, bias)
-            else:
-                # Need to transpose back: [K, N] -> [N, K]
-                output = torch.nn.functional.linear(x, weight_bf16.t(), bias)
-            return output
+                return torch.nn.functional.linear(x, weight_bf16.t(), bias)
 
         if self.use_marlin:
             return apply_fp8_marlin_linear(
@@ -706,9 +638,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         self.quant_config = quant_config
         self.weight_block_size = self.quant_config.weight_block_size
         self.block_quant: bool = self.weight_block_size is not None
-
-        self.fused_experts: mk.FusedMoEModularKernel | None = None  # type: ignore
-
         self.fp8_backend = get_fp8_moe_backend(self.block_quant)
 
         self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
@@ -868,12 +797,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
 
     def process_weights_after_loading(self, layer: Module) -> None:
         # Lazy import to avoid importing triton too early.
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            is_rocm_aiter_moe_enabled,
-            shuffle_weights,
-        )
 
-        self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
+        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
 
         # TODO (rob): refactor block quant into separate class.
         if self.block_quant:
@@ -915,7 +840,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             )
             if self.rocm_aiter_moe_enabled:
                 # reshaping weights is required for aiter moe kernel.
-                shuffled_w13, shuffled_w2 = shuffle_weights(
+                shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                     layer.w13_weight.data, layer.w2_weight.data
                 )
 
@@ -961,7 +886,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
             if self.rocm_aiter_moe_enabled:
                 # reshaping weights is required for aiter moe kernel.
-                shuffled_w13, shuffled_w2 = shuffle_weights(
+                shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                     layer.w13_weight, layer.w2_weight
                 )
 
@@ -1041,7 +966,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     start += shard_size
 
             if self.rocm_aiter_moe_enabled:
-                shuffled_w13, shuffled_w2 = shuffle_weights(
+                shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                     layer.w13_weight, layer.w2_weight
                 )
 
@@ -1184,6 +1109,14 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             block_shape=self.weight_block_size,
         )
 
+    @property
+    def supports_eplb(self) -> bool:
+        return True
+
+    @property
+    def allow_inplace(self) -> bool:
+        return True
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -1213,29 +1146,24 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             assert logical_replica_count is not None
             assert isinstance(layer, FusedMoE)
 
-        if (
-            self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-            and self.fused_experts is None
-        ):
+        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
             assert activation == "silu", (
                 f"Expected 'silu' activation but got {activation}"
             )
-            assert scoring_func == "sigmoid", (
-                f"Expected 'sigmoid' scoring func but got {scoring_func}"
-            )
+
             if self.block_quant:
                 import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe  # noqa: E501, F401
 
-                assert (
-                    renormalize and use_grouped_topk and custom_routing_function is None
-                )
                 e_score_correction_bias = (
                     e_score_correction_bias.to(x.dtype)
                     if e_score_correction_bias is not None
                     else None
                 )
+                routing_method_type = layer.routing_method_type
                 return torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8(
-                    routing_logits=router_logits.to(torch.float32),
+                    routing_logits=router_logits.to(torch.float32)
+                    if routing_method_type == RoutingMethodType.DeepSeekV3
+                    else router_logits,
                     routing_bias=e_score_correction_bias,
                     x=x,
                     w13_weight=layer.w13_weight,
@@ -1250,6 +1178,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     expert_offset=layer.ep_rank * layer.local_num_experts,
                     local_num_experts=layer.local_num_experts,
                     block_shape=self.weight_block_size,
+                    routing_method_type=routing_method_type,
                     routed_scaling=routed_scaling_factor,
                 )
             else:
@@ -1293,10 +1222,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             num_fused_shared_experts=layer.num_fused_shared_experts,
         )
 
-        #
-        # Note: the order of checks is important since self.fused_experts
-        # can override fused_experts or cutlass but not rocm or marlin.
-        #
         topk_weights, topk_ids, zero_expert_result = select_result
 
         if self.rocm_aiter_moe_enabled:
@@ -1304,7 +1229,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 rocm_aiter_fused_experts,
             )
 
-            assert self.fused_experts is None
             result = rocm_aiter_fused_experts(
                 x,
                 layer.w13_weight,
@@ -1318,7 +1242,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             )
         elif self.use_marlin:
             assert activation == "silu", f"{activation} not supported for Marlin MoE."
-            assert self.fused_experts is None
             result = fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -1336,19 +1259,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 expert_map=expert_map,
                 workspace=layer.workspace,
             )
-        elif self.fused_experts:
-            result = self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                expert_map=expert_map,
-            )
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
             assert not self.block_quant
             assert not renormalize and custom_routing_function is not None
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 8a914c57a9f7d..caabcd0ca0ee5 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -585,8 +585,6 @@ class GGUFMoEMethod(FusedMoEMethodBase):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.")
 
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 0d5439357fda2..42a569e7770c0 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -742,8 +742,6 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `GPTQMarlinMoEMethod` yet."
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index 5b3aabfde0c1e..e0234191c62bf 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -399,6 +399,7 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
             layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
         import intel_extension_for_pytorch as ipex
 
+        ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
         layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
             layer.w13_weight,
             layer.w2_weight,
@@ -407,6 +408,7 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
             a1_scale_inv=layer.w13_input_scale,
             a2_scale_inv=layer.w2_input_scale,
             use_prepack=True,
+            experts_start_id=ep_rank_start,
         )
 
     def get_fused_moe_quant_config(
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
index 3ac90553bbc74..8ebb863479601 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
@@ -4,54 +4,14 @@
 
 import torch
 
-import vllm.envs as envs
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.platforms import current_platform
-from vllm.utils.torch_utils import direct_register_custom_op
 
 from .cutlass import CutlassScaledMMLinearKernel
 from .ScaledMMLinearKernel import Int8ScaledMMLinearLayerConfig
 
 
-def rocm_aiter_gemm_w8a8_impl(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    As: torch.Tensor,
-    Bs: torch.Tensor,
-    bias: torch.Tensor | None = None,
-    output_dtype: torch.dtype = torch.float16,
-) -> torch.Tensor:
-    from aiter import gemm_a8w8_CK
-
-    # gemm_a8w8_CK(a, b, scale_a, scale_b, bias) expects
-    # a to be [M, K]
-    # b to be [N, K]
-    # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format
-    return gemm_a8w8_CK(A, B, As, Bs, bias, output_dtype)
-
-
-def rocm_aiter_gemm_w8a8_fake(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    As: torch.Tensor,
-    Bs: torch.Tensor,
-    bias: torch.Tensor | None = None,
-    output_dtype: torch.dtype = torch.float16,
-) -> torch.Tensor:
-    m = A.shape[0]
-    n = B.shape[0]
-    Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
-    return Y
-
-
-if current_platform.is_rocm():
-    direct_register_custom_op(
-        op_name="rocm_aiter_gemm_w8a8",
-        op_func=rocm_aiter_gemm_w8a8_impl,
-        fake_impl=rocm_aiter_gemm_w8a8_fake,
-    )
-
-
 class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
     @classmethod
     def get_min_capability(cls) -> int:
@@ -75,7 +35,7 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
                 + "installed on ROCm.",
             )
         # Check if rocm_aiter_gemm_w8a8_scaled_mm is enabled
-        if not (envs.VLLM_ROCM_USE_AITER_LINEAR and envs.VLLM_ROCM_USE_AITER):
+        if not (rocm_aiter_ops.is_linear_enabled()):
             return (
                 False,
                 "AiterScaledMMLinearKernel is disabled. "
@@ -154,6 +114,4 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
         # a to be [M, K]
         # b to be [N, K]
         # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format
-        return torch.ops.vllm.rocm_aiter_gemm_w8a8(
-            x_q, w_q.t(), x_s, w_s, bias, out_dtype
-        )
+        return rocm_aiter_ops.gemm_w8a8(x_q, w_q.t(), x_s, w_s, bias, out_dtype)
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 53b25af440353..79bcb61dc5060 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -18,9 +18,6 @@ from vllm.model_executor.layers.fused_moe.config import (
     fp8_w8a8_moe_quant_config,
     nvfp4_moe_quant_config,
 )
-from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
-    is_valid_flashinfer_cutlass_fused_moe,
-)
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
@@ -226,7 +223,10 @@ class ModelOptFp8Config(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
-        from vllm.attention.layer import Attention  # Avoid circular import
+        from vllm.attention.layer import (  # Avoid circular import
+            Attention,
+            MLAAttention,
+        )
 
         if isinstance(layer, LinearBase):
             if self.is_layer_excluded(prefix):
@@ -235,7 +235,7 @@ class ModelOptFp8Config(QuantizationConfig):
             if "vision_tower" in prefix or "vision_model" in prefix:
                 return UnquantizedLinearMethod()
             return ModelOptFp8LinearMethod(self)
-        elif isinstance(layer, Attention):
+        elif isinstance(layer, (Attention, MLAAttention)):
             return ModelOptFp8KVCacheMethod(self)
         elif isinstance(layer, FusedMoE):
             return ModelOptFp8MoEMethod(self, layer)
@@ -569,9 +569,13 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
 
         return fp8_w8a8_moe_quant_config(
             w1_scale=layer.w13_weight_scale,
+            g1_alphas=(layer.w13_weight_scale * layer.w13_input_scale).squeeze(),
             w2_scale=layer.w2_weight_scale,
+            g2_alphas=(layer.w2_weight_scale * layer.w2_input_scale).squeeze(),
             a1_scale=layer.w13_input_scale,
+            a1_gscale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
+            a2_gscale=1.0 / layer.w2_input_scale,
             per_act_token_quant=False,
         )
 
@@ -604,7 +608,6 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
             )
 
         if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
-            assert self.fused_experts is None
             assert activation == "silu", (
                 f"Expected 'silu' activation but got {activation}"
             )
@@ -637,24 +640,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
             indices_type=self.topk_indices_dtype,
         )
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # cutlass or fused_experts.
-        #
-        if self.fused_experts is not None:
-            return self.fused_experts(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights,
-                topk_ids,
-                inplace=False,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-            )
-        elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
+        if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
             assert not renormalize
             assert activation == "silu", (
                 f"Expected 'silu' activation but got {activation}"
@@ -904,7 +890,10 @@ class ModelOptNvFp4Config(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
-        from vllm.attention.layer import Attention  # Avoid circular import
+        from vllm.attention.layer import (  # Avoid circular import
+            Attention,
+            MLAAttention,
+        )
 
         skip_layer = self.is_layer_excluded(prefix)
         if isinstance(layer, LinearBase):
@@ -914,7 +903,7 @@ class ModelOptNvFp4Config(QuantizationConfig):
             if "vision_tower" in prefix or "vision_model" in prefix:
                 return UnquantizedLinearMethod()
             return ModelOptNvFp4LinearMethod(self)
-        elif isinstance(layer, Attention):
+        elif isinstance(layer, (Attention, MLAAttention)):
             return ModelOptFp8KVCacheMethod(self)
         elif isinstance(layer, FusedMoE):
             if skip_layer:
@@ -957,6 +946,9 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
         elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
             self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
             assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
+        elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass":
+            self.backend = "cutlass"
+            assert cutlass_fp4_supported(), f"Cutlass is required for {self.backend}"
 
         if self.backend == "none":
             raise ValueError(
@@ -1158,8 +1150,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         moe: FusedMoEConfig,
         layer: torch.nn.Module,
     ) -> None:
-        from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (  # noqa: E501
-            detect_nvfp4_moe_support,
+        from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (
+            detect_nvfp4_moe_support,  # noqa: E501
         )
 
         super().__init__(moe)
@@ -1646,8 +1638,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
 
             from vllm.model_executor.models.llama4 import Llama4MoE
 
-            assert self.fused_experts is None
-
             a1_gscale = layer.w13_input_scale_quant
             (hidden_states_fp4, hidden_states_scale_linear_fp4) = (
                 flashinfer.fp4_quantize(
@@ -1719,13 +1709,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             indices_type=self.topk_indices_dtype,
         )
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # flashinfer cutlass, cutlass fp4 or fused_experts but not marlin or
-        # trtllm.
-        #
         if self.use_marlin:
-            assert self.fused_experts is None
             return fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -1746,23 +1730,24 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 workspace=layer.workspace,
             )
 
-        elif self.fused_experts is not None:
-            assert (
-                self.allow_flashinfer
-                and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
+        elif (
+            self.allow_flashinfer
+            and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
+        ):
+            from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
+                flashinfer_cutlass_moe_fp4,
             )
 
-            assert is_valid_flashinfer_cutlass_fused_moe(
-                x, layer.w13_weight, layer.w2_weight
-            ), "Flashinfer CUTLASS Fused MoE not applicable!"
+            assert self.moe_quant_config is not None
 
-            return self.fused_experts(
+            return flashinfer_cutlass_moe_fp4(
                 hidden_states=x,
                 w1=layer.w13_weight,
                 w2=layer.w2_weight,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
-                inplace=False,  # TODO(shuw): fix later, now output is high prec
+                quant_config=self.moe_quant_config,
+                inplace=False,
                 activation=activation,
                 global_num_experts=global_num_experts,
                 expert_map=expert_map,
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index b0a268b9950b7..2090c86f78dc8 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -226,7 +226,6 @@ class MoeWNA16Method(FusedMoEMethodBase):
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
-        self.moe = layer
         layer.quant_config = self.quant_config
         bit8_pack_factor = self.quant_config.bit8_pack_factor
         group_size = self.quant_config.group_size
@@ -381,7 +380,6 @@ class MoeWNA16Method(FusedMoEMethodBase):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for `MoeWNA16Method` yet.")
 
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 597ee1b6bafe1..e339f15510d79 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -43,6 +43,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
 from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
     _can_support_mxfp4,
     _swizzle_mxfp4,
+    get_padding_alignment,
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
 from vllm.model_executor.utils import set_weight_attrs
@@ -142,6 +143,9 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
         else:
             logger.info_once("Using Triton backend")
             return Mxfp4Backend.TRITON
+    elif current_platform.is_xpu():
+        logger.info_once("Using ipex marlin backend on XPU")
+        return Mxfp4Backend.MARLIN
     elif current_platform.is_rocm() and has_triton_kernels():
         logger.info_once("Using Triton backend")
         return Mxfp4Backend.TRITON
@@ -188,7 +192,10 @@ class Mxfp4Config(QuantizationConfig):
                 return UnquantizedLinearMethod()
             raise NotImplementedError("Mxfp4 linear layer is not implemented")
         elif isinstance(layer, FusedMoE):
-            return Mxfp4MoEMethod(layer.moe_config)
+            if current_platform.is_xpu():
+                return IpexMxfp4MoEMethod(layer.moe_config)
+            else:
+                return Mxfp4MoEMethod(layer.moe_config)
         elif isinstance(layer, Attention):
             raise NotImplementedError("Mxfp4 attention layer is not implemented")
         return None
@@ -197,8 +204,6 @@ class Mxfp4Config(QuantizationConfig):
 class Mxfp4MoEMethod(FusedMoEMethodBase):
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
-        self.topk_indices_dtype = None
-        self.moe = moe
         self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
         self.max_capture_size = (
             get_current_vllm_config().compilation_config.max_cudagraph_capture_size
@@ -247,7 +252,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             intermediate_size_per_partition_after_pad = round_up(
                 intermediate_size_per_partition, 128
             )
-            hidden_size = round_up(hidden_size, 256)
+            if current_platform.is_xpu():
+                hidden_size = round_up(hidden_size, 128)
+            else:
+                hidden_size = round_up(hidden_size, 256)
 
             layer.params_dtype = params_dtype
             layer.num_experts = num_experts
@@ -275,10 +283,11 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             )
             hidden_size = round_up(hidden_size, 128)
         elif current_platform.is_rocm():
+            pad_align = get_padding_alignment()
             intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, 256
+                intermediate_size_per_partition, pad_align
             )
-            hidden_size = round_up(hidden_size, 256)
+            hidden_size = round_up(hidden_size, pad_align)
         else:
             intermediate_size_per_partition_after_pad = round_up(
                 intermediate_size_per_partition, 64
@@ -815,6 +824,18 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                     "EP batched experts format"
                 )
         else:
+            layer.w13_weight = (
+                self.w13_weight_triton_tensor
+                if layer.w13_weight is None
+                else layer.w13_weight
+            )
+            layer.w2_weight = (
+                self.w2_weight_triton_tensor
+                if layer.w2_weight is None
+                else layer.w2_weight
+            )
+            assert all([w is not None for w in [layer.w13_weight, layer.w2_weight]])
+
             assert self.moe_quant_config is not None
             if (
                 self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
@@ -838,71 +859,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                     f"Incompatible Mxfp4 backend ({self.mxfp4_backend}) for EP"
                 )
 
-    def _route_and_experts(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: int | None = None,
-        num_expert_group: int | None = None,
-        global_num_experts: int = -1,
-        expert_map: torch.Tensor | None = None,
-        custom_routing_function: Callable | None = None,
-        scoring_func: str = "softmax",
-        e_score_correction_bias: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: torch.Tensor | None = None,
-        logical_to_physical_map: torch.Tensor | None = None,
-        logical_replica_count: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        assert isinstance(self.fused_experts, mk.FusedMoEModularKernel)
-
-        topk_weights, topk_ids, _ = FusedMoE.select_experts(
-            hidden_states=x,
-            router_logits=router_logits,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function,
-            scoring_func=scoring_func,
-            e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype,
-            enable_eplb=enable_eplb,
-            expert_map=expert_map,
-            expert_load_view=expert_load_view,
-            logical_to_physical_map=logical_to_physical_map,
-            logical_replica_count=logical_replica_count,
-        )
-
-        w13_weight = (
-            self.w13_weight_triton_tensor
-            if layer.w13_weight is None
-            else layer.w13_weight
-        )
-        w2_weight = (
-            self.w2_weight_triton_tensor if layer.w2_weight is None else layer.w2_weight
-        )
-        assert all([w is not None for w in [w13_weight, w2_weight]])
-
-        return self.fused_experts(
-            hidden_states=x,
-            w1=w13_weight,
-            w2=w2_weight,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            inplace=True,
-            activation=activation,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-        )
+    @property
+    def allow_inplace(self) -> bool:
+        return True
 
     def apply(
         self,
@@ -930,29 +889,6 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         if enable_eplb:
             raise NotImplementedError("EPLB is not supported for mxfp4")
 
-        if self.fused_experts is not None:
-            return self._route_and_experts(
-                layer,
-                x,
-                router_logits,
-                top_k,
-                renormalize,
-                use_grouped_topk,
-                topk_group,
-                num_expert_group,
-                global_num_experts,
-                expert_map,
-                custom_routing_function,
-                scoring_func,
-                e_score_correction_bias,
-                apply_router_weight_on_input,
-                activation,
-                enable_eplb,
-                expert_load_view,
-                logical_to_physical_map,
-                logical_replica_count,
-            )
-
         if self.mxfp4_backend == Mxfp4Backend.MARLIN:
             topk_weights, topk_ids, _ = FusedMoE.select_experts(
                 hidden_states=x,
@@ -1047,7 +983,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 None,
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
-                tune_max_num_tokens=self.max_capture_size,
+                tune_max_num_tokens=max(self.max_capture_size, 1),
             )[0]
             return trtllm_gen_output
         elif (
@@ -1122,7 +1058,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 tp_rank=self.moe.tp_rank,
                 ep_size=self.moe.ep_size,
                 ep_rank=self.moe.ep_rank,
-                tune_max_num_tokens=self.max_capture_size,
+                tune_max_num_tokens=max(self.max_capture_size, 1),
                 **extra_kwargs,
             )
 
@@ -1146,3 +1082,86 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             )
         else:
             raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
+
+
+class IpexMxfp4MoEMethod(Mxfp4MoEMethod):
+    def __init__(self, moe_config: FusedMoEConfig):
+        super().__init__(moe_config)
+        self.moe_config = moe_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        super().create_weights(
+            layer,
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            params_dtype,
+            **extra_weight_attrs,
+        )
+        self.original_hidden_size = hidden_size
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        import intel_extension_for_pytorch as ipex
+
+        layer.w13_weight.data = layer.w13_weight.data.view(torch.int32)
+        layer.w2_weight.data = layer.w2_weight.data.view(torch.int32)
+        ep_rank_start = self.moe_config.ep_rank * self.moe_config.num_local_experts
+        layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+            layer.w13_weight,
+            layer.w2_weight,
+            w1_scale_inv=layer.w13_weight_scale,
+            w2_scale_inv=layer.w2_weight_scale,
+            w13_bias=layer.w13_bias,
+            w2_bias=layer.w2_bias,
+            is_mxfp4=True,
+            experts_start_id=ep_rank_start,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        assert activation == "swigluoai", (
+            "Only swiglu_oai activation is supported for IPEX MXFP4 MoE"
+        )  # noqa:
+        hidden_size_pad = round_up(self.original_hidden_size, 128)
+        x_pad = torch.nn.functional.pad(x, (0, hidden_size_pad - x.size(-1)))
+        hidden_states = layer.ipex_fusion(
+            x_pad,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            activation="swiglu_oai",
+        )
+        hidden_states = hidden_states[..., : self.original_hidden_size].contiguous()
+        return hidden_states
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index a8f4b1b0db68d..30772c3665b06 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -8,6 +8,7 @@ import torch
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (
     FusedMoE,
@@ -21,10 +22,6 @@ from vllm.model_executor.layers.fused_moe.config import (
     ocp_mx_moe_quant_config,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
-from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-    is_rocm_aiter_moe_enabled,
-    use_mxfp4_aiter_moe,
-)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     prepare_moe_fp8_layer_for_marlin,
 )
@@ -122,7 +119,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         if current_platform.is_rocm():
             self.use_marlin = False
 
-        self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
+        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
 
     def create_weights(
         self,
@@ -309,30 +306,19 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
                 )
         # Property to determine if AITER is used
         if self.rocm_aiter_moe_enabled:
-            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa E501
-                rocm_aiter_fused_experts,
-                shuffle_weights,
-            )
-
             # reshaping weights is required for aiter moe kernel.
-            shuffled_w13, shuffled_w2 = shuffle_weights(
+            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                 layer.w13_weight.data, layer.w2_weight.data
             )
 
             layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
             layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
 
-            self.rocm_aiter_fused_experts_func = rocm_aiter_fused_experts
         elif self.use_marlin:
             prepare_moe_fp8_layer_for_marlin(layer, False)
             # Activations not quantized for marlin.
             del layer.w13_input_scale
             del layer.w2_input_scale
-            self.fused_experts_func = None
-        else:
-            from vllm.model_executor.layers.fused_moe import fused_experts
-
-            self.fused_experts_func = fused_experts
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
@@ -369,8 +355,6 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet."
@@ -392,7 +376,11 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         )
 
         if self.rocm_aiter_moe_enabled:
-            return self.rocm_aiter_fused_experts_func(
+            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+                rocm_aiter_fused_experts,
+            )
+
+            return rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
                 w2=layer.w2_weight,
@@ -403,7 +391,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
                 quant_config=self.moe_quant_config,
                 expert_map=expert_map,
             )
-        if self.use_marlin:
+        elif self.use_marlin:
             assert activation == "silu", f"{activation} not supported for Marlin MoE."
             return fused_marlin_moe(
                 x,
@@ -421,22 +409,22 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
                 global_num_experts=global_num_experts,
                 expert_map=expert_map,
             )
+        else:
+            from vllm.model_executor.layers.fused_moe import fused_experts
 
-        assert self.fused_experts_func is not None
-
-        return self.fused_experts_func(
-            hidden_states=x,
-            w1=layer.w13_weight,
-            w2=layer.w2_weight,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            inplace=True,
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            quant_config=self.moe_quant_config,
-        )
+            return fused_experts(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                inplace=True,
+                activation=activation,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                global_num_experts=global_num_experts,
+                expert_map=expert_map,
+                quant_config=self.moe_quant_config,
+            )
 
 
 class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
@@ -463,6 +451,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
 
         self.weight_dtype = self.weight_quant["dtype"].replace("fp", "mxfp")
         self.input_dtype = self.input_quant["dtype"].replace("fp", "mxfp")
+        self.fp4_dtype = getattr(torch, "float4_e2m1fn_x2", None)
 
         self.ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype(
             self.input_dtype, self.weight_dtype
@@ -474,13 +463,15 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
                 "not implemented. Please open an issue."
             )
 
+        self.use_rocm_aiter_moe = rocm_aiter_ops.is_fused_moe_enabled()
+
         self.emulate = not current_platform.supports_mx() or not (
-            use_mxfp4_aiter_moe() and self.ocp_mx_scheme == "w_mxfp4_a_mxfp4"
+            self.use_rocm_aiter_moe and self.ocp_mx_scheme == "w_mxfp4_a_mxfp4"
         )
         if self.emulate:
             logger.warning_once(
                 f"The current mode (supports_mx={current_platform.supports_mx()}, "
-                f"use_mxfp4_aiter_moe={use_mxfp4_aiter_moe()}, "
+                f"use_mxfp4_aiter_moe={self.use_rocm_aiter_moe}, "
                 f"ocp_mx_scheme={self.ocp_mx_scheme}) "
                 "does not support native MXFP4/MXFP6 "
                 "computation. Simulated weight dequantization and activation "
@@ -586,6 +577,17 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
         w2_weight_scale = layer.w2_weight_scale.view(s0 * s1, -1)
         w2_weight_scale = e8m0_shuffle(w2_weight_scale)
         layer.w2_weight_scale.data = w2_weight_scale.view(s0, s1, -1)
+
+        if self.fp4_dtype is not None:
+            layer.w13_weight = torch.nn.Parameter(
+                layer.w13_weight.view(self.fp4_dtype),
+                requires_grad=layer.w13_weight.requires_grad,
+            )
+            layer.w2_weight = torch.nn.Parameter(
+                layer.w2_weight.view(self.fp4_dtype),
+                requires_grad=layer.w2_weight.requires_grad,
+            )
+
         torch.cuda.empty_cache()
 
     def get_fused_moe_quant_config(
@@ -601,6 +603,10 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
             block_shape=None,
         )
 
+    @property
+    def allow_inplace(self) -> bool:
+        return True
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -624,8 +630,6 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `QuarkOCP_MX_MoEMethod` yet."
@@ -647,28 +651,18 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
         )
 
         if not self.emulate:
-            from aiter import ActivationType, QuantType
-            from aiter.fused_moe import fused_moe
-
-            aiter_acts = {
-                ActivationType.No.name.lower(): ActivationType.No,
-                ActivationType.Silu.name.lower(): ActivationType.Silu,
-                ActivationType.Gelu.name.lower(): ActivationType.Gelu,
-            }
-            assert activation in aiter_acts, (
-                f"Aiter CK fp4 MoE doesn't support activation {activation}"
+            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+                rocm_aiter_fused_experts,
             )
-            out = fused_moe(
+
+            out = rocm_aiter_fused_experts(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
-                topk_weights,
-                topk_ids,
-                quant_type=QuantType.per_1x32,
-                w1_scale=layer.w13_weight_scale,
-                w2_scale=layer.w2_weight_scale,
-                activation=aiter_acts[activation],
-                doweight_stage1=False,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                activation=activation,
+                quant_config=self.moe_quant_config,
             )
         else:
             from vllm.model_executor.layers.fused_moe import fused_experts
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
index c25c522dea55f..007e78e68d5cd 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
@@ -31,6 +31,13 @@ from .quark_scheme import QuarkScheme
 logger = init_logger(__name__)
 
 
+# TODO: move registration of custom op to aiter_ops.py
+# `from vllm._aiter_ops import rocm_aiter_ops`
+# use `rocm_aiter_ops.is_asm_fp4_gemm_dynamic_quant_enabled()`
+# for envs checks which does not require @cache anymore.
+# triton kernel is torch compile compatible.
+# does not require direct registeration.
+# use `rocm_aiter_ops.triton_fp4_gemm_dynamic_qaunt`.
 @cache
 def is_rocm_aiter_fp4_asm_gemm_enabled() -> bool:
     return (
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index e4f7ff8339569..52656263a601b 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -377,8 +377,6 @@ class RTNMoEMethod(FusedMoEMethodBase):
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for `RTNMoEMethod` yet.")
 
diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py
index f42c45dae76d2..3fee71e193db5 100644
--- a/vllm/model_executor/layers/quantization/torchao.py
+++ b/vllm/model_executor/layers/quantization/torchao.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib
 import json
+import types
 from importlib.util import find_spec
 from typing import Any, Optional
 
@@ -27,6 +28,39 @@ from vllm.model_executor.utils import set_weight_attrs
 logger = init_logger(__name__)
 
 
+def _bond_method_to_cls(func, obj):
+    if hasattr(func, "__self__") or not callable(func):
+        # If the function is already bound to an instance, return it as is
+        return func
+    else:
+        return types.MethodType(func, obj)
+
+
+def _get_weight_attrs(param):
+    # record attributes attached to the weight, so we can
+    # recover later
+    recorded_weight_attr = {}
+    for key in param.__dict__:
+        if hasattr(param, key):
+            attr = getattr(param, key)
+            if not callable(attr):
+                recorded_weight_attr[key] = attr
+            elif hasattr(attr, "__self__") and param is attr.__self__:
+                # if attr is a bonded method for an instance, and
+                # attr.__self__ points to the instance (param)
+                # we'll record the underlying function object
+                recorded_weight_attr[key] = attr.__func__
+            else:
+                recorded_weight_attr[key] = attr
+    return recorded_weight_attr
+
+
+def _restore_weight_attrs(param, recorded_weight_attr):
+    for attr_name, attr in recorded_weight_attr.items():
+        if not hasattr(param, attr_name):
+            setattr(param, attr_name, _bond_method_to_cls(attr, param))
+
+
 def torchao_version_at_least(torchao_version: str) -> bool:
     if find_spec("torchao"):
         try:
@@ -57,6 +91,14 @@ def should_skip(prefix: str, skip_modules: list[str]) -> bool:
     return False
 
 
+if torchao_version_at_least("0.15.0"):
+    from torchao.prototype.tensor_conversion.api import (
+        convert_to_packed_tensor_based_on_current_hardware,
+    )
+else:
+    convert_to_packed_tensor_based_on_current_hardware = lambda t: t
+
+
 class TorchAOConfig(QuantizationConfig):
     """Config class for torchao."""
 
@@ -307,12 +349,32 @@ class TorchAOLinearMethod(LinearMethodBase):
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if self.quant_config.is_checkpoint_torchao_serialized:
+            if not hasattr(layer, "weight"):
+                return
+
+            # record attributes attached to the weight, so we can
+            # recover later
+            recorded_weight_attr = _get_weight_attrs(layer.weight)
+
+            layer.weight = Parameter(
+                convert_to_packed_tensor_based_on_current_hardware(layer.weight),
+                requires_grad=layer.weight.requires_grad,
+            )
+
+            _restore_weight_attrs(layer.weight, recorded_weight_attr)
             return
 
-        # quantize the weight on the fly if the checkpoint is not already
+        # online quantize the weight if the checkpoint is not already
         # quantized by torchao
+        recorded_weight_attr = _get_weight_attrs(layer.weight)
+
         weight = torchao_quantize_param_data(
             layer.weight, self.quant_config.torchao_config
         )
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        weight = torch.nn.Parameter(
+            convert_to_packed_tensor_based_on_current_hardware(weight),
+            weight.requires_grad,
+        )
+
+        _restore_weight_attrs(weight, recorded_weight_attr)
         layer.register_parameter("weight", weight)
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
index 50ea049c3d5a1..e49d374f154d8 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -27,20 +27,25 @@ class FlashinferMoeBackend(Enum):
 
 
 def calculate_tile_tokens_dim(num_tokens, top_k, num_experts):
+    from flashinfer import next_positive_power_of_2
+
     # FlashInfer 0.2.10 has issues with larger tile sizes. Set to 8 for now.
     # TODO: Revert this to dynamic calculation once a new version of FlashInfer
     # with the necessary kernels is released.
     tile_tokens_dim = 8
 
-    # from flashinfer import next_positive_power_of_2
-
-    # # Guess tokens per expert assuming perfect expert distribution first.
-    # num_tokens_per_expert = (num_tokens * top_k) // num_experts
-    # # And pad the number to the next power of 2.
-    # tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
-    # # Cap to 8-64 tokens per CTA tile as it's the range supported by the
-    # # kernel.
-    # tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
+    # A factor considering tokens are not perfectly balanced among experts.
+    imbalance_factor = 1.3
+    # Calculate the number of tokens per expert
+    # assuming perfect distribution.
+    num_tokens_per_expert = (num_tokens * top_k) // num_experts
+    # Apply the imbalance factor.
+    num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor)
+    # And pad the number to the next power of 2.
+    tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
+    # Cap to 8-max_tile_tokens_dim tokens per CTA tile
+    # as it's the range supported by the kernel.
+    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
 
     return tile_tokens_dim
 
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index f25148abb619c..c63196b893574 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -12,6 +12,7 @@ import torch
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -68,50 +69,6 @@ def cutlass_scaled_mm(
     )
 
 
-def rocm_aiter_gemm_w8a8_blockscale_impl(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    As: torch.Tensor,
-    Bs: torch.Tensor,
-    block_size: list[int],
-    output_dtype: torch.dtype = torch.float16,
-) -> torch.Tensor:
-    import aiter as rocm_aiter
-
-    return rocm_aiter.gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
-
-
-def rocm_aiter_gemm_w8a8_blockscale_fake(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    As: torch.Tensor,
-    Bs: torch.Tensor,
-    block_size: list[int],
-    output_dtype: torch.dtype = torch.float16,
-) -> torch.Tensor:
-    m = A.shape[0]
-    n = B.shape[0]
-    Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
-    return Y
-
-
-if current_platform.is_rocm():
-    direct_register_custom_op(
-        op_name="rocm_aiter_gemm_w8a8_blockscale",
-        op_func=rocm_aiter_gemm_w8a8_blockscale_impl,
-        fake_impl=rocm_aiter_gemm_w8a8_blockscale_fake,
-    )
-    if (
-        envs.VLLM_ROCM_USE_AITER
-        and envs.VLLM_ROCM_USE_AITER_LINEAR
-        and current_platform.is_fp8_fnuz()
-    ):
-        import aiter as rocm_aiter
-        from aiter import get_hip_quant
-
-        aiter_per1x128_quant = get_hip_quant(rocm_aiter.QuantType.per_1x128)
-
-
 # TODO we should be able to change the type of block_size to GroupShape
 # after we resolve GroupShape compilation issue
 # https://github.com/vllm-project/vllm/issues/25270
@@ -293,7 +250,9 @@ class W8A8BlockFp8LinearOp:
         ):
             output = self._run_deepgemm(input_2d, weight, weight_scale)
         else:
-            output = self.w8a8_blockscale_op(input_2d, weight, weight_scale)
+            output = self.w8a8_blockscale_op(
+                input_2d, weight, weight_scale, input_scale
+            )
 
         if bias is not None:
             output = output + bias
@@ -322,7 +281,9 @@ class W8A8BlockFp8LinearOp:
         input_2d: torch.Tensor,
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
+        input_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
+        assert input_scale is None
         assert self.input_quant_op is not None
         q_input, input_scale = self.input_quant_op(input_2d)
         if self.is_hopper:
@@ -350,18 +311,43 @@ class W8A8BlockFp8LinearOp:
         input_2d: torch.Tensor,
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
+        input_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         assert self.act_quant_group_shape == GroupShape(1, 128)
-        q_input, input_scale = aiter_per1x128_quant(
-            input_2d.contiguous(), quant_dtype=rocm_aiter.dtypes.fp8
+
+        n, k = weight.shape
+
+        use_triton = (
+            not current_platform.is_fp8_fnuz()
+            and rocm_aiter_ops.is_triton_gemm_w8a8_tuned(n, k)
         )
-        return torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale(
+
+        if use_triton:
+            gemm_a8w8_blockscale_op = rocm_aiter_ops.triton_gemm_a8w8_blockscale
+        else:
+            gemm_a8w8_blockscale_op = rocm_aiter_ops.gemm_w8a8_blockscale
+
+        if input_scale is not None:
+            q_input = input_2d
+        # MI350 case uses triton kernel
+        elif use_triton:
+            q_input, input_scale = per_token_group_quant_fp8(
+                input_2d,
+                self.act_quant_group_shape.col,
+                column_major_scales=False,
+                use_ue8m0=False,
+            )
+        # MI300 uses tuned AITER ASM/C++ kernel
+        else:
+            q_input, input_scale = rocm_aiter_ops.per_1x128_fp8_quant(input_2d)
+
+        return gemm_a8w8_blockscale_op(
             q_input,
             weight,
             input_scale,
             weight_scale,
             list(self.weight_group_shape),
-            input_2d.dtype,
+            output_dtype=input_2d.dtype,
         )
 
     def _run_triton(
@@ -369,7 +355,9 @@ class W8A8BlockFp8LinearOp:
         input_2d: torch.Tensor,
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
+        input_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
+        assert input_scale is None
         assert self.input_quant_op is not None
         q_input, input_scale = self.input_quant_op(input_2d)
         return torch.ops.vllm.w8a8_triton_block_scaled_mm_func(
@@ -391,6 +379,7 @@ class W8A8BlockFp8LinearOp:
                 torch.Tensor,
                 torch.Tensor,
                 torch.Tensor,
+                torch.Tensor | None,
             ],
             torch.Tensor,
         ],
@@ -938,17 +927,6 @@ def requant_weight_ue8m0_inplace(
         s_old.copy_(s_requant)
 
 
-def check_aiter_fp8_linear_support() -> bool:
-    """AITER is only supported on ROCm and only for FP8_FNUZ
-    and at the moment are MI300 series"""
-    return (
-        current_platform.is_rocm()
-        and envs.VLLM_ROCM_USE_AITER
-        and envs.VLLM_ROCM_USE_AITER_LINEAR
-        and current_platform.is_fp8_fnuz()
-    )
-
-
 def _maybe_pad_fp8_weight(weight: torch.Tensor) -> torch.Tensor:
     """Pad the weight tensor. This is an optimization on ROCm platform, which
     can benefit from tensors located far enough from one another in memory"""
diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
index 5e87cadfb1070..34a31bcf6a747 100644
--- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
@@ -7,6 +7,7 @@ import torch
 
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.triton_utils import triton
 from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
 
 logger = init_logger(__name__)
@@ -99,6 +100,14 @@ def _can_support_mxfp4(
     )
 
 
+def get_padding_alignment():
+    return (
+        256
+        if triton.runtime.driver.active.get_current_target().arch in ("gfx950",)
+        else 128
+    )
+
+
 def _dequant_mxfp4(
     x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype
 ) -> torch.Tensor:
diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py
index 64187c97cab7e..56c165f9c041a 100644
--- a/vllm/model_executor/layers/rotary_embedding/__init__.py
+++ b/vllm/model_executor/layers/rotary_embedding/__init__.py
@@ -191,9 +191,16 @@ def get_rope(
                 k: v
                 for k, v in rope_scaling.items()
                 if k
-                in ("extrapolation_factor", "attn_factor", "beta_fast", "beta_slow")
+                in (
+                    "extrapolation_factor",
+                    "attn_factor",
+                    "beta_fast",
+                    "beta_slow",
+                    "apply_yarn_scaling",
+                )
             }
             if "mrope_section" in rope_scaling:
+                extra_kwargs.pop("apply_yarn_scaling", None)
                 rotary_emb = MRotaryEmbedding(
                     head_size,
                     rotary_dim,
diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py
index 91276320df4d0..2ef54e75df44e 100644
--- a/vllm/model_executor/layers/rotary_embedding/base.py
+++ b/vllm/model_executor/layers/rotary_embedding/base.py
@@ -4,13 +4,10 @@
 
 import torch
 
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.custom_op import CustomOp
 
 from .common import apply_rotary_emb_torch
-from .rocm_aiter_rope_ops import (
-    is_rocm_triton_rotary_embedding_enabled,
-    rocm_aiter_rotary_emb,
-)
 
 
 @CustomOp.register("rotary_embedding")
@@ -48,8 +45,8 @@ class RotaryEmbeddingBase(CustomOp):
             cache = cache.to(dtype)
         self.cos_sin_cache: torch.Tensor
         self.register_buffer("cos_sin_cache", cache, persistent=False)
-        self.is_rocm_triton_rotary_embedding_enabled = (
-            is_rocm_triton_rotary_embedding_enabled()
+        self.is_rocm_triton_rotary_embed_enabled = (
+            rocm_aiter_ops.is_triton_rotary_embed_enabled()
         )
 
     def _compute_inv_freq(self, base: float) -> torch.Tensor:
@@ -169,9 +166,9 @@ class RotaryEmbedding(RotaryEmbeddingBase):
         query: torch.Tensor,
         key: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
-        if self.is_rocm_triton_rotary_embedding_enabled:
+        if self.is_rocm_triton_rotary_embed_enabled:
             self._match_cos_sin_cache_dtype(query)
-            rocm_aiter_rotary_emb(
+            rocm_aiter_ops.triton_rotary_embed(
                 positions,
                 query,
                 key,
diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py
index 9e6ec9fdd523c..196533b617959 100644
--- a/vllm/model_executor/layers/rotary_embedding/common.py
+++ b/vllm/model_executor/layers/rotary_embedding/common.py
@@ -77,7 +77,11 @@ def dispatch_rotary_emb_function(
     if current_platform.is_cuda():
         return apply_rotary_emb
 
-    if current_platform.is_rocm():
+    # if torch compile is not enabled
+    # use rotary embedding function from flash_attn package
+    # otherwise use the naive pytorch embedding implementation
+    # is faster when torch compile is enabled.
+    if current_platform.is_rocm() and not torch.compiler.is_compiling():
         if find_spec("flash_attn") is not None:
             from flash_attn.ops.triton.rotary import apply_rotary
 
@@ -87,11 +91,10 @@ def dispatch_rotary_emb_function(
                 "flash_attn is not installed. Falling back to PyTorch "
                 "implementation for rotary embeddings."
             )
-
     if default is not None:
         return default
-    else:
-        return apply_rotary_emb_torch
+
+    return apply_rotary_emb_torch
 
 
 # yarn functions
diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
index d9134f05fddff..e72834e473c15 100644
--- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
@@ -146,6 +146,15 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbeddingBase):
             key = key_rot
         return query, key
 
+    def forward_hip(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        return self.forward_native(positions, query, key, offsets)
+
     def forward_cuda(
         self,
         positions: torch.Tensor,
diff --git a/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py b/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py
deleted file mode 100644
index a01d14f7b3a13..0000000000000
--- a/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import torch
-
-import vllm.envs as envs
-from vllm.platforms import current_platform
-from vllm.utils.torch_utils import direct_register_custom_op
-
-
-def is_rocm_triton_rotary_embedding_enabled() -> bool:
-    return (
-        current_platform.is_rocm()
-        and envs.VLLM_ROCM_USE_AITER
-        and envs.VLLM_ROCM_USE_TRITON_ROPE
-    )
-
-
-def rocm_aiter_rotary_emb_with_key_forward_triton_impl(
-    positions: torch.Tensor,
-    sin: torch.Tensor,
-    cos: torch.Tensor,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    rotate_style: int = 0,
-    is_nope_first: bool = False,
-) -> None:
-    import aiter.ops.triton.rope as ops
-
-    ops.rope_cached_thd_positions_2c_fwd_inplace(
-        query,
-        key,
-        cos,
-        sin,
-        positions,
-        rotate_style,
-        reuse_freqs_front_part=True,
-        nope_first=is_nope_first,
-    )
-
-
-def rocm_aiter_rotary_emb_with_key_forward_triton_fake(
-    positions: torch.Tensor,
-    sin: torch.Tensor,
-    cos: torch.Tensor,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    rotate_style: int = 0,
-    is_nope_first: bool = False,
-) -> None:
-    pass
-
-
-if is_rocm_triton_rotary_embedding_enabled():
-    direct_register_custom_op(
-        op_name="rocm_aiter_rotary_emb_with_key_forward_triton",
-        op_func=rocm_aiter_rotary_emb_with_key_forward_triton_impl,
-        mutates_args=["key", "query"],
-        fake_impl=rocm_aiter_rotary_emb_with_key_forward_triton_fake,
-        dispatch_key=current_platform.dispatch_key,
-    )
-
-
-def rocm_aiter_rotary_emb(
-    positions: torch.Tensor,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    cos_sin_cache: torch.Tensor,
-    head_size: int,
-    rotary_dim: int,
-    is_neox_style: bool,
-):
-    num_tokens = positions.numel()
-    cos, sin = cos_sin_cache.chunk(2, dim=-1)
-    query_shape = query.shape
-    key_shape = key.shape
-    rotate_style = 0 if is_neox_style else 1
-
-    query = query.view(num_tokens, -1, head_size)
-    key = key.view(num_tokens, -1, head_size)
-    query_ = query[..., :rotary_dim]
-    key_ = key[..., :rotary_dim]
-    positions = positions.view(*query.shape[:1])
-    torch.ops.vllm.rocm_aiter_rotary_emb_with_key_forward_triton(
-        positions,
-        sin,
-        cos,
-        query_,
-        key_,
-        rotate_style,
-        False,
-    )
-    query = query.view(query_shape)
-    key = key.view(key_shape)
diff --git a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py
index 93c92e7801e13..ff46ad74b302e 100644
--- a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py
@@ -27,6 +27,7 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         attn_factor: float = 1,
         beta_fast: int = 32,
         beta_slow: int = 1,
+        apply_yarn_scaling: bool = True,
     ) -> None:
         self.scaling_factor = scaling_factor
         self.extrapolation_factor = extrapolation_factor
@@ -34,7 +35,11 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         self.beta_fast = beta_fast
         self.beta_slow = beta_slow
         # Get n-d magnitude scaling corrected for interpolation
-        self.mscale = float(yarn_get_mscale(self.scaling_factor) * attn_factor)
+        self.mscale = (
+            float(yarn_get_mscale(self.scaling_factor) * attn_factor)
+            if apply_yarn_scaling
+            else float(attn_factor)
+        )
         super().__init__(
             head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
         )
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index 925f9ac0a16ea..b17bdd0b72078 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -103,12 +103,42 @@ def default_unquantized_gemm(
     return torch.nn.functional.linear(x, weight, bias)
 
 
+def use_aiter_triton_gemm(n, m, k, dtype):
+    if (
+        envs.VLLM_ROCM_USE_AITER == 0
+        or envs.VLLM_ROCM_USE_AITER_TRITON_GEMM == 0
+        # MI300's - fp8nuz=True
+        or current_platform.is_fp8_fnuz()
+        or dtype not in [torch.float16, torch.bfloat16]
+    ):
+        return False
+
+    # use hipblaslt for the larger GEMMs
+    if n > 2048 and m > 512:
+        return False
+    return (
+        (m == 5120 and k == 2880)
+        or (m == 2880 and k == 4096)
+        or (m == 128 and k == 2880)
+        or (m == 640 and k == 2880)
+        or (m == 2880 and k == 512)
+    )
+
+
 def rocm_unquantized_gemm_impl(
     x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None = None
 ) -> torch.Tensor:
     from vllm.platforms.rocm import on_gfx9
 
+    n = x.numel() / x.size(-1)
+    m = weight.shape[0]
     k = weight.shape[1]
+
+    if use_aiter_triton_gemm(n, m, k, x.dtype):
+        from aiter.ops.triton.gemm_a16w16 import gemm_a16w16
+
+        return gemm_a16w16(x, weight, bias)
+
     use_skinny = (
         envs.VLLM_ROCM_USE_SKINNY_GEMM
         and on_gfx9()
@@ -120,11 +150,8 @@ def rocm_unquantized_gemm_impl(
         return torch.nn.functional.linear(x, weight, bias)
 
     x_view = x.reshape(-1, x.size(-1))
-    n = x_view.shape[0]
-    m = weight.shape[0]
-    cu_count = current_platform.get_cu_count()
-
     if m > 8 and 0 < n <= 4:
+        cu_count = current_platform.get_cu_count()
         out = ops.wvSplitK(weight, x_view, cu_count, bias)
         return out.reshape(*x.shape[:-1], weight.shape[0])
     elif m % 4 == 0 and n == 1 and k <= 8192 and bias is None:
@@ -133,7 +160,7 @@ def rocm_unquantized_gemm_impl(
     return torch.nn.functional.linear(x, weight, bias)
 
 
-def rocm_unquantized_gemm_impl_fake(
+def rocm_unquantized_gemm_fake(
     x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None = None
 ) -> torch.Tensor:
     return x.new_empty((*x.shape[:-1], weight.shape[0]))
@@ -145,13 +172,13 @@ def rocm_unquantized_gemm(
     weight: torch.Tensor,
     bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    return torch.ops.vllm.rocm_unquantized_gemm_impl(x, weight, bias)
+    return torch.ops.vllm.rocm_unquantized_gemm(x, weight, bias)
 
 
 direct_register_custom_op(
-    op_name="rocm_unquantized_gemm_impl",
+    op_name="rocm_unquantized_gemm",
     op_func=rocm_unquantized_gemm_impl,
-    fake_impl=rocm_unquantized_gemm_impl_fake,
+    fake_impl=rocm_unquantized_gemm_fake,
 )
 
 
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 06b4f9271b41b..e4e530f0cea88 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -726,8 +726,6 @@ def tensorize_vllm_model(
         ) as stream:
             stream.write(encryption_params.key)
 
-    assert envs.VLLM_USE_V1
-
     from vllm.v1.engine.llm_engine import LLMEngine
 
     engine = LLMEngine.from_vllm_config(engine_config)
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 7990024c55d0c..f742090df71fd 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -186,15 +186,21 @@ def _create_pooling_model_cls(orig_cls: _T) -> _T:
         def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
             raise NotImplementedError
 
-        def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        def load_weights(
+            self,
+            weights: Iterable[tuple[str, torch.Tensor]],
+            load_lm_head: bool = False,
+        ):
             # TODO: Support uninitialized params tracking
 
-            # We have deleted this attribute, so don't load it
-            weights = (
-                (name, data)
-                for name, data in weights
-                if not name.startswith("lm_head.")
-            )
+            # For most pooling models: We have deleted this attribute, so don't load it.
+            # For converting an LLM into a seq cls model, we need the lm_head.
+            if not load_lm_head:
+                weights = (
+                    (name, data)
+                    for name, data in weights
+                    if not name.startswith("lm_head.")
+                )
 
             # If `*ForCausalLM` defines `load_weights` on the inner model
             # and there are no other inner modules with parameters,
@@ -431,8 +437,12 @@ def load_weights_using_from_2_way_softmax(
         )
         model.lm_head = model.lm_head.tie_weights(embed_tokens)
 
-    # Skip ModelForSequenceClassification in MRO to avoid infinite recursion
-    loaded_weights = type(model).__mro__[1].load_weights(model, weights)
+    # ModelForPooling is dynamically defined inside the _create_pooling_model_cls
+    # function, so we need use this hacky method to obtain it.
+    pooling_model_cls = next(
+        x for x in type(model).__mro__ if x.__name__ == "ModelForPooling"
+    )
+    loaded_weights = pooling_model_cls.load_weights(model, weights, load_lm_head=True)
 
     from vllm.transformers_utils.tokenizer import get_tokenizer
 
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index e0b6444c91836..bb505219ea17c 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -75,7 +75,11 @@ class ArcticMLP(nn.Module):
         )
 
         self.w13 = MergedColumnParallelLinear(
-            self.hidden_size, [self.ffn_dim] * 2, bias=False, quant_config=quant_config
+            self.hidden_size,
+            [self.ffn_dim] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.w13",
         )
         self.w2 = RowParallelLinear(
             self.ffn_dim,
@@ -83,6 +87,7 @@ class ArcticMLP(nn.Module):
             bias=False,
             reduce_results=reduce_results,
             quant_config=quant_config,
+            prefix=f"{prefix}.w2",
         )
         if config.hidden_act != "silu":
             raise ValueError(
@@ -297,6 +302,7 @@ class ArcticAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
@@ -304,6 +310,7 @@ class ArcticAttention(nn.Module):
             bias=False,
             reduce_results=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index ccf32c9ee1ac7..39990b9fd6837 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -98,13 +98,22 @@ class BaiChuanMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
-            intermediate_size, hidden_size, bias=False, quant_config=quant_config
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -152,12 +161,14 @@ class BaiChuanAttention(nn.Module):
             self.total_num_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.W_pack",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         # Create the alibi slopes and slice them.
         if self.position_embedding == "ALIBI":
@@ -235,6 +246,7 @@ class BaiChuanDecoderLayer(nn.Module):
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
         )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index 151fb3b6acc46..bc7dbb618f65c 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -60,6 +60,7 @@ class BambaMLP(nn.Module):
         config: BambaConfig,
         quant_config: QuantizationConfig | None = None,
         bias: bool = False,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -67,12 +68,14 @@ class BambaMLP(nn.Module):
             output_sizes=[config.intermediate_size] * 2,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
             input_size=config.intermediate_size,
             output_size=config.hidden_size,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if config.hidden_act != "silu":
             raise ValueError(
@@ -118,7 +121,9 @@ class BambaMixerDecoderLayer(nn.Module):
             prefix=f"{prefix}.mixer",
         )
 
-        self.feed_forward = BambaMLP(config, quant_config=quant_config)
+        self.feed_forward = BambaMLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.feed_forward"
+        )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.pre_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
@@ -202,12 +207,14 @@ class BambaAttentionDecoderLayer(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             config.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.attn = Attention(
@@ -219,7 +226,9 @@ class BambaAttentionDecoderLayer(nn.Module):
             prefix=f"{prefix}.attn",
         )
 
-        self.feed_forward = BambaMLP(config, quant_config=quant_config)
+        self.feed_forward = BambaMLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.feed_forward"
+        )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.pre_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index bbbd14adf92b2..18b09ee43b7b0 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -108,12 +108,14 @@ class BloomAttention(nn.Module):
             self.total_num_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
         )
         self.dense = RowParallelLinear(
             self.hidden_size,
             self.hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense",
         )
 
         # Create the alibi slopes and slice them.
@@ -152,6 +154,7 @@ class BloomMLP(nn.Module):
         self,
         config: BloomConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
@@ -159,12 +162,14 @@ class BloomMLP(nn.Module):
             hidden_size,
             4 * hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_h_to_4h",
         )
         self.gelu_impl = get_act_fn("gelu")
         self.dense_4h_to_h = RowParallelLinear(
             4 * hidden_size,
             hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_4h_to_h",
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -192,7 +197,7 @@ class BloomBlock(nn.Module):
         self.post_attention_layernorm = nn.LayerNorm(
             hidden_size, eps=config.layer_norm_epsilon
         )
-        self.mlp = BloomMLP(config, quant_config)
+        self.mlp = BloomMLP(config, quant_config, prefix=f"{prefix}.mlp")
         self.apply_residual_connection_post_layernorm = (
             config.apply_residual_connection_post_layernorm
         )
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 6f7e18d78bada..54ff6991fa702 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -227,6 +227,7 @@ class ChameleonMLP(nn.Module):
         hidden_act: str,
         quant_config: QuantizationConfig | None = None,
         bias: bool = False,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -234,12 +235,14 @@ class ChameleonMLP(nn.Module):
             output_sizes=[intermediate_size] * 2,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
             input_size=intermediate_size,
             output_size=hidden_size,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -299,12 +302,14 @@ class ChameleonAttention(nn.Module):
             total_num_kv_heads=self.total_num_kv_heads,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             input_size=self.total_num_heads * self.head_dim,
             output_size=hidden_size,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         self.q_norm = ChameleonLayerNorm((self.num_heads, self.head_dim))
         self.k_norm = ChameleonLayerNorm((self.num_kv_heads, self.head_dim))
@@ -393,6 +398,7 @@ class ChameleonDecoderLayer(nn.Module):
             hidden_act=config.hidden_act,
             quant_config=quant_config,
             bias=getattr(config, "mlp_bias", False),
+            prefix=f"{prefix}.mlp",
         )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
@@ -462,6 +468,7 @@ class ChameleonSwinDecoderLayer(nn.Module):
             hidden_act=config.hidden_act,
             quant_config=quant_config,
             bias=getattr(config, "mlp_bias", False),
+            prefix=f"{prefix}.mlp",
         )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 7150977e9266b..66b246878b0aa 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
+from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
@@ -257,9 +258,9 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
         if structured_outputs_config.reasoning_parser == "":
             structured_outputs_config.reasoning_parser = "openai_gptoss"
 
-        # Increase the max capture size from 512 to 992 for performance.
+        # Increase the max capture size from 512 to 1024 for performance.
         # NOTE(woosuk): This will increase the number of CUDA graphs
-        # from 67 to 81.
+        # from 67 to 83.
         compilation_config = vllm_config.compilation_config
         # Only override when the user has not set either of
         # cudagraph_capture_sizes or max_cudagraph_capture_size.
@@ -267,11 +268,9 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
             compilation_config.cudagraph_capture_sizes is None
             and compilation_config.max_cudagraph_capture_size is None
         ):
-            # FIXME(woosuk): When using full cuda graph with FA3, the max
-            # supported size is 992.
-            compilation_config.max_cudagraph_capture_size = 992
+            compilation_config.max_cudagraph_capture_size = 1024
             logger.info(
-                "Overriding max cuda graph capture size to %d for performance.", 992
+                "Overriding max cuda graph capture size to %d for performance.", 1024
             )
 
 
@@ -285,10 +284,6 @@ class MambaModelConfig(VerifyAndUpdateConfig):
         Args:
             vllm_config: vLLM Config
         """
-
-        if not envs.VLLM_USE_V1:
-            return
-
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
 
@@ -299,7 +294,7 @@ class MambaModelConfig(VerifyAndUpdateConfig):
             if model_config.supports_mamba_prefix_caching:
                 logger.info(
                     "Warning: Prefix caching is currently enabled. "
-                    "Its support for Mamba2 layers is experimental. "
+                    "Its support for Mamba layers is experimental. "
                     "Please report any issues you may observe."
                 )
             else:
@@ -329,10 +324,6 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
         Args:
             vllm_config: vLLM Config
         """
-
-        if not envs.VLLM_USE_V1:
-            return
-
         # Save the user input before it gets modified by MambaModelConfig
         mamba_block_size = vllm_config.cache_config.mamba_block_size
         # Enable FULL_AND_PIECEWISE by default
@@ -364,6 +355,17 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
             ).page_size_bytes
         else:
             kernel_block_alignment_size = 16
+            if (
+                current_platform.is_device_capability(100)
+                and model_config.get_head_size() == 256
+                and (
+                    envs.VLLM_ATTENTION_BACKEND is None
+                    or envs.VLLM_ATTENTION_BACKEND == "FLASHINFER"
+                )
+            ):
+                # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that`
+                # head size 256 and block size 16 is not supported on blackwell.
+                kernel_block_alignment_size = 32
             attn_page_size_1_token = FullAttentionSpec(
                 block_size=1,
                 num_kv_heads=model_config.get_num_kv_heads(parallel_config),
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 088960e064489..22095d05848ce 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -209,12 +209,14 @@ class DbrxAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.Wqkv",
         )
         self.out_proj = RowParallelLinear(
             self.d_model,
             self.d_model,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
         )
         self.rotary_emb = get_rope(
             self.head_dim,
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
deleted file mode 100644
index ac934abea45df..0000000000000
--- a/vllm/model_executor/models/deepseek.py
+++ /dev/null
@@ -1,509 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only Deepseek model."""
-
-from collections.abc import Iterable
-from itertools import islice
-from typing import Any
-
-import torch
-from torch import nn
-from transformers import PretrainedConfig
-
-from vllm.attention import Attention
-from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (
-    get_pp_group,
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-    tensor_model_parallel_all_reduce,
-)
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.sequence import IntermediateTensors
-
-from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (
-    AutoWeightsLoader,
-    extract_layer_index,
-    is_pp_missing_parameter,
-    make_empty_intermediate_tensors_factory,
-    make_layers,
-    maybe_prefix,
-)
-
-
-class DeepseekMLP(nn.Module):
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str,
-        quant_config: QuantizationConfig | None = None,
-        reduce_results: bool = True,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
-        )
-        self.down_proj = RowParallelLinear(
-            intermediate_size,
-            hidden_size,
-            bias=False,
-            quant_config=quant_config,
-            reduce_results=reduce_results,
-        )
-        if hidden_act != "silu":
-            raise ValueError(
-                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
-            )
-        self.act_fn = SiluAndMul()
-
-    def forward(self, x):
-        gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
-        return x
-
-
-class DeepseekMoE(nn.Module):
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        quant_config: QuantizationConfig | None = None,
-        prefix: str = "",
-    ):
-        super().__init__()
-        self.config = config
-        self.rank = get_tensor_model_parallel_rank()
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.n_routed_experts = config.n_routed_experts
-        self.top_k = config.num_experts_per_tok
-        if self.tp_size > self.n_routed_experts:
-            raise ValueError(
-                f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {self.n_routed_experts}."
-            )
-
-        self.experts = nn.ModuleList(
-            [
-                DeepseekMLP(
-                    hidden_size=config.hidden_size,
-                    intermediate_size=config.moe_intermediate_size,
-                    hidden_act=config.hidden_act,
-                    quant_config=quant_config,
-                    reduce_results=False,
-                )
-                for idx in range(self.n_routed_experts)
-            ]
-        )
-        self.pack_params()
-
-        self.gate = ReplicatedLinear(
-            config.hidden_size, self.n_routed_experts, bias=False, quant_config=None
-        )
-
-        if config.n_shared_experts is not None:
-            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
-            self.shared_experts = DeepseekMLP(
-                hidden_size=config.hidden_size,
-                intermediate_size=intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=quant_config,
-                reduce_results=False,
-            )
-
-    def pack_params(self):
-        w1 = []
-        w2 = []
-        for expert in self.experts:
-            w1.append(expert.gate_up_proj.weight)
-            w2.append(expert.down_proj.weight)
-        self.w1 = torch._utils._flatten_dense_tensors(w1)
-        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
-        for data, param in zip(w1s, w1):
-            param.data = data
-        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
-
-        self.w2 = torch._utils._flatten_dense_tensors(w2)
-        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
-        for data, param in zip(w2s, w2):
-            param.data = data
-
-        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        num_tokens, hidden_dim = hidden_states.shape
-        hidden_states = hidden_states.view(-1, hidden_dim)
-        if self.config.n_shared_experts is not None:
-            shared_output = self.shared_experts(hidden_states)
-        # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
-
-        topk_weights, topk_ids, _ = fused_topk(
-            hidden_states,
-            router_logits,
-            self.top_k,
-            renormalize=self.config.norm_topk_prob,
-        )
-
-        final_hidden_states = fused_experts(
-            hidden_states, self.w1, self.w2, topk_weights, topk_ids, inplace=True
-        )
-
-        if self.config.n_shared_experts is not None:
-            final_hidden_states = final_hidden_states + shared_output
-        final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
-
-        return final_hidden_states.view(num_tokens, hidden_dim)
-
-
-class DeepseekAttention(nn.Module):
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        num_kv_heads: int,
-        rope_theta: float = 10000,
-        rope_scaling: dict[str, Any] | None = None,
-        max_position_embeddings: int = 8192,
-        cache_config: CacheConfig | None = None,
-        quant_config: QuantizationConfig | None = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.hidden_size = hidden_size
-        tp_size = get_tensor_model_parallel_world_size()
-        self.total_num_heads = num_heads
-        assert self.total_num_heads % tp_size == 0
-        self.num_heads = self.total_num_heads // tp_size
-        self.total_num_kv_heads = num_kv_heads
-        if self.total_num_kv_heads >= tp_size:
-            # Number of KV heads is greater than TP size, so we partition
-            # the KV heads across multiple tensor parallel GPUs.
-            assert self.total_num_kv_heads % tp_size == 0
-        else:
-            # Number of KV heads is less than TP size, so we replicate
-            # the KV heads across multiple tensor parallel GPUs.
-            assert tp_size % self.total_num_kv_heads == 0
-        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = hidden_size // self.total_num_heads
-        self.q_size = self.num_heads * self.head_dim
-        self.kv_size = self.num_kv_heads * self.head_dim
-        self.scaling = self.head_dim**-0.5
-        self.rope_theta = rope_theta
-        self.max_position_embeddings = max_position_embeddings
-
-        self.qkv_proj = QKVParallelLinear(
-            hidden_size,
-            self.head_dim,
-            self.total_num_heads,
-            self.total_num_kv_heads,
-            bias=False,
-            quant_config=quant_config,
-        )
-
-        self.o_proj = RowParallelLinear(
-            self.total_num_heads * self.head_dim,
-            hidden_size,
-            bias=False,
-            quant_config=quant_config,
-        )
-
-        self.rotary_emb = get_rope(
-            self.head_dim,
-            rotary_dim=self.head_dim,
-            max_position=max_position_embeddings,
-            base=rope_theta,
-            rope_scaling=rope_scaling,
-        )
-        self.attn = Attention(
-            self.num_heads,
-            self.head_dim,
-            self.scaling,
-            num_kv_heads=self.num_kv_heads,
-            cache_config=cache_config,
-            quant_config=quant_config,
-            prefix=f"{prefix}.attn",
-        )
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
-        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q, k, v)
-        output, _ = self.o_proj(attn_output)
-        return output
-
-
-class DeepseekDecoderLayer(nn.Module):
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        cache_config: CacheConfig | None = None,
-        quant_config: QuantizationConfig | None = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        layer_idx = extract_layer_index(prefix)
-        self.hidden_size = config.hidden_size
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-        self.self_attn = DeepseekAttention(
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            num_kv_heads=config.num_key_value_heads,
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
-            max_position_embeddings=max_position_embeddings,
-            cache_config=cache_config,
-            quant_config=quant_config,
-            prefix=f"{prefix}.self_attn",
-        )
-        if (
-            config.n_routed_experts is not None
-            and layer_idx >= config.first_k_dense_replace
-            and layer_idx % config.moe_layer_freq == 0
-        ):
-            self.mlp = DeepseekMoE(
-                config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
-            )
-        else:
-            self.mlp = DeepseekMLP(
-                hidden_size=config.hidden_size,
-                intermediate_size=config.intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=quant_config,
-                prefix=f"{prefix}.mlp",
-            )
-        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(
-            config.hidden_size, eps=config.rms_norm_eps
-        )
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        residual: torch.Tensor | None,
-    ) -> torch.Tensor:
-        # Self Attention
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
-        else:
-            hidden_states, residual = self.input_layernorm(hidden_states, residual)
-        hidden_states = self.self_attn(
-            positions=positions,
-            hidden_states=hidden_states,
-        )
-
-        # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
-        return hidden_states, residual
-
-
-class DeepseekModel(nn.Module):
-    fall_back_to_pt_during_load = False
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-
-        config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
-
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = VocabParallelEmbedding(
-            config.vocab_size,
-            config.hidden_size,
-        )
-        self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers,
-            lambda prefix: DeepseekDecoderLayer(
-                config, cache_config, quant_config=quant_config, prefix=prefix
-            ),
-            prefix=f"{prefix}.layers",
-        )
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
-            ["hidden_states", "residual"], config.hidden_size
-        )
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.embed_tokens(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: IntermediateTensors | None,
-        inputs_embeds: torch.Tensor | None = None,
-    ) -> torch.Tensor | IntermediateTensors:
-        if get_pp_group().is_first_rank:
-            if inputs_embeds is not None:
-                hidden_states = inputs_embeds
-            else:
-                hidden_states = self.get_input_embeddings(input_ids)
-            residual = None
-        else:
-            hidden_states = intermediate_tensors["hidden_states"]
-            residual = intermediate_tensors["residual"]
-        for layer in islice(self.layers, self.start_layer, self.end_layer):
-            hidden_states, residual = layer(positions, hidden_states, residual)
-        if not get_pp_group().is_last_rank:
-            return IntermediateTensors(
-                {"hidden_states": hidden_states, "residual": residual}
-            )
-        hidden_states, _ = self.norm(hidden_states, residual)
-        return hidden_states
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Skip experts that are not assigned to this worker.
-                if (
-                    "mlp.experts." in name or "mlp.shared_experts." in name
-                ) and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Skip experts that are not assigned to this worker.
-                if (
-                    "mlp.experts." in name or "mlp.shared_experts." in name
-                ) and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
-
-class DeepseekForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"],
-    }
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        self.config = config
-        self.quant_config = quant_config
-        self.model = DeepseekModel(
-            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
-        )
-        self.lm_head = ParallelLMHead(
-            config.vocab_size,
-            config.hidden_size,
-            quant_config=quant_config,
-            prefix=maybe_prefix(prefix, "lm_head"),
-        )
-        if self.config.tie_word_embeddings:
-            self.lm_head.weight = self.model.embed_tokens.weight
-        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors
-        )
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: IntermediateTensors | None = None,
-        inputs_embeds: torch.Tensor | None = None,
-    ) -> torch.Tensor | IntermediateTensors:
-        hidden_states = self.model(
-            input_ids, positions, intermediate_tensors, inputs_embeds
-        )
-        return hidden_states
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-    ) -> torch.Tensor | None:
-        logits = self.logits_processor(self.lm_head, hidden_states)
-        return logits
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py
index 107b1e1a05823..fd2f20ea501d0 100644
--- a/vllm/model_executor/models/deepseek_eagle.py
+++ b/vllm/model_executor/models/deepseek_eagle.py
@@ -24,9 +24,12 @@ from vllm.model_executor.models.deepseek_v2 import (
     DeepseekV2DecoderLayer,
     DeepseekV3ForCausalLM,
 )
+from vllm.utils import init_logger
 
 from .utils import AutoWeightsLoader, maybe_prefix
 
+logger = init_logger(__name__)
+
 
 @support_torch_compile
 class DeepseekV2Model(nn.Module):
@@ -215,6 +218,10 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM):
             self.config.vocab_size, scale=logit_scale
         )
 
+        # Set MoE hyperparameters
+        self.num_moe_layers = self.config.num_hidden_layers
+        self.set_moe_parameters()
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index 3984d23970ac5..26b9c25e6bdb5 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -8,6 +8,7 @@ from transformers import PretrainedConfig
 
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -25,11 +26,15 @@ from vllm.sequence import IntermediateTensors
 
 from .deepseek_v2 import (
     DeepseekV2DecoderLayer,
+    DeepseekV2MixtureOfExperts,
+    DeepseekV2MoE,
     get_spec_layer_idx_from_weight_name,
 )
 from .interfaces import SupportsPP
 from .utils import maybe_prefix
 
+logger = init_logger(__name__)
+
 
 class SharedHead(nn.Module):
     def __init__(
@@ -119,6 +124,7 @@ class DeepSeekMultiTokenPredictor(nn.Module):
         self.mtp_start_layer_idx = config.num_hidden_layers
         self.num_mtp_layers = config.num_nextn_predict_layers
         # to map the exact layer index from weights
+
         self.layers = torch.nn.ModuleDict(
             {
                 str(idx): DeepSeekMultiTokenPredictorLayer(
@@ -172,13 +178,33 @@ class DeepSeekMultiTokenPredictor(nn.Module):
 
 
 @support_torch_compile
-class DeepSeekMTP(nn.Module, SupportsPP):
+class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
         self.model = DeepSeekMultiTokenPredictor(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
+        # Set MoE hyperparameters
+        self.set_moe_parameters()
+
+    def set_moe_parameters(self):
+        self.expert_weights = []
+        self.num_moe_layers = self.config.num_nextn_predict_layers
+        self.num_expert_groups = self.config.n_group
+
+        self.moe_layers = []
+        self.moe_mlp_layers = []
+        example_moe = None
+        for layer in self.model.layers.values():
+            assert isinstance(layer, DeepSeekMultiTokenPredictorLayer)
+            layer = layer.mtp_block
+            assert isinstance(layer, DeepseekV2DecoderLayer)
+            if isinstance(layer.mlp, DeepseekV2MoE):
+                example_moe = layer.mlp
+                self.moe_mlp_layers.append(layer.mlp)
+                self.moe_layers.append(layer.mlp.experts)
+        self.extract_moe_parameters(example_moe)
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index fa24db456af4d..0432567521843 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -131,10 +131,34 @@ class NGramPerReqLogitsProcessor(AdapterLogitsProcessor):
     """Example of overriding the wrapper class `__init__()` in order to utilize
     info about the device type"""
 
-    def __init__(
-        self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool
-    ):
-        super().__init__(vllm_config, device, is_pin_memory)
+    @classmethod
+    def validate_params(cls, params: SamplingParams):
+        ngram_size = params.extra_args and params.extra_args.get("ngram_size")
+        window_size = params.extra_args and params.extra_args.get("window_size", 100)
+        whitelist_token_ids = params.extra_args and params.extra_args.get(
+            "whitelist_token_ids", None
+        )
+        # if ngram_size is not provided, skip validation because the processor
+        # will not be used.
+        if ngram_size is None:
+            return None
+
+        if not isinstance(ngram_size, int) or ngram_size <= 0:
+            raise ValueError(
+                f"`ngram_size` has to be a strictly positive integer, got {ngram_size}."
+            )
+        if not isinstance(window_size, int) or window_size <= 0:
+            raise ValueError(
+                "`window_size` has to be a strictly positive integer, "
+                f"got {window_size}."
+            )
+        if whitelist_token_ids is not None and not isinstance(
+            whitelist_token_ids, Iterable
+        ):
+            raise ValueError(
+                "`whitelist_token_ids` has to be a sequence of integers, "
+                f"got {whitelist_token_ids}."
+            )
 
     def is_argmax_invariant(self) -> bool:
         return True
@@ -150,26 +174,8 @@ class NGramPerReqLogitsProcessor(AdapterLogitsProcessor):
         )
         if ngram_size is None:
             return None
-        if not isinstance(ngram_size, int) or ngram_size <= 0:
-            raise ValueError(
-                f"`ngram_size` has to be a strictly positive integer, got {ngram_size}."
-            )
-        if not isinstance(window_size, int) or window_size <= 0:
-            raise ValueError(
-                "`window_size` has to be a strictly positive integer, "
-                f"got {window_size}."
-            )
-        if whitelist_token_ids is not None and not isinstance(
-            whitelist_token_ids, Iterable
-        ):
-            raise ValueError(
-                "`whitelist_token_ids` has to be a set of integers, "
-                f"got {whitelist_token_ids}."
-            )
-        else:
-            whitelist_token_ids = (
-                set(whitelist_token_ids) if whitelist_token_ids else None
-            )
+
+        whitelist_token_ids = set(whitelist_token_ids) if whitelist_token_ids else None
         return NoRepeatNGramLogitsProcessor(
             ngram_size=ngram_size,
             window_size=window_size,
@@ -411,18 +417,10 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
                 f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
             )
 
-        if self.text_config.topk_method == "noaux_tc":
-            architectures = ["DeepseekV3ForCausalLM"]
-        elif not self.text_config.use_mla:
-            architectures = ["DeepseekForCausalLM"]
-        else:
-            architectures = ["DeepseekV2ForCausalLM"]
-
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=self.text_config,
             prefix=maybe_prefix(prefix, "language_model"),
-            architectures=architectures,
         )
 
         self.make_empty_intermediate_tensors = (
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index db7b86ffaf962..38189e17f7d8b 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -33,6 +33,7 @@ import torch
 from torch import nn
 from transformers import DeepseekV2Config, DeepseekV3Config
 
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.attention import Attention
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
@@ -50,14 +51,11 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
-from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-    is_rocm_aiter_fusion_shared_expert_enabled,
-    is_rocm_aiter_moe_enabled,
-)
 from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     MergedColumnParallelLinear,
+    QKVParallelLinear,
     ReplicatedLinear,
     RowParallelLinear,
 )
@@ -104,6 +102,92 @@ elif current_platform.is_xpu():
 logger = init_logger(__name__)
 
 
+class DeepseekAttention(nn.Module):
+    """Normal MHA implementation used by Deepseek v1."""
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        config: DeepseekV2Config | DeepseekV3Config,
+        hidden_size: int,
+        num_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: dict[str, Any] | None = None,
+        max_position_embeddings: int = 8192,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
 class DeepseekV2MLP(nn.Module):
     def __init__(
         self,
@@ -163,10 +247,10 @@ class DeepseekV2MoE(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.tp_rank = get_tensor_model_parallel_rank()
 
-        self.routed_scaling_factor = config.routed_scaling_factor
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts: int = config.n_routed_experts
         self.n_shared_experts: int = config.n_shared_experts
@@ -186,7 +270,7 @@ class DeepseekV2MoE(nn.Module):
             quant_config=None,
             prefix=f"{prefix}.gate",
         )
-        if config.topk_method == "noaux_tc":
+        if getattr(config, "topk_method", None) == "noaux_tc":
             self.gate.e_score_correction_bias = nn.Parameter(
                 torch.empty(config.n_routed_experts, dtype=torch.float32)
             )
@@ -207,10 +291,8 @@ class DeepseekV2MoE(nn.Module):
             self.physical_expert_start + self.n_local_physical_experts
         )
 
-        if (
-            config.n_shared_experts is None
-            or is_rocm_aiter_fusion_shared_expert_enabled()
-        ):
+        self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+        if config.n_shared_experts is None or self.is_rocm_aiter_moe_enabled:
             self.shared_experts = None
         else:
             intermediate_size = config.moe_intermediate_size * config.n_shared_experts
@@ -236,21 +318,21 @@ class DeepseekV2MoE(nn.Module):
             renormalize=config.norm_topk_prob,
             quant_config=quant_config,
             use_grouped_topk=True,
-            num_expert_group=config.n_group,
-            topk_group=config.topk_group,
+            num_expert_group=getattr(config, "n_group", 1),
+            topk_group=getattr(config, "topk_group", 1),
             prefix=f"{prefix}.experts",
-            scoring_func=config.scoring_func,
+            scoring_func=getattr(config, "scoring_func", "softmax"),
             # we do scaling outside, set factor to 1.0 to avoid double mul
             # aiter applies routed_scaling_factor internally
             routed_scaling_factor=1.0
-            if not is_rocm_aiter_moe_enabled()
+            if not self.is_rocm_aiter_moe_enabled
             else self.routed_scaling_factor,
             e_score_correction_bias=self.gate.e_score_correction_bias,
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
             n_shared_experts=config.n_shared_experts
-            if is_rocm_aiter_fusion_shared_expert_enabled()
+            if rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
             else None,
         )
 
@@ -284,7 +366,7 @@ class DeepseekV2MoE(nn.Module):
         # Fix FP16 overflow
         # See DeepseekV2DecoderLayer for more details.
         if hidden_states.dtype != torch.float16:
-            if not is_rocm_aiter_moe_enabled():
+            if not self.is_rocm_aiter_moe_enabled:
                 final_hidden_states *= self.routed_scaling_factor
         elif self.shared_experts is not None:
             assert shared_output is not None
@@ -994,11 +1076,24 @@ class DeepseekV2DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        moe_layer_freq = getattr(config, "moe_layer_freq", 1)
         # DecoderLayers are created with `make_layers` which passes the prefix
         # with the layer's index.
         layer_idx = int(prefix.split(sep=".")[-1])
         self.layer_idx = layer_idx
-        if model_config.use_mla:
+
+        # verify MLA attention specific fields
+        qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
+        qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
+        v_head_dim = getattr(config, "v_head_dim", 0)
+        kv_lora_rank = getattr(config, "kv_lora_rank", 0)
+        use_mha = config.model_type == "deepseek" or all(
+            dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
+        )
+
+        if use_mha:
+            attn_cls = DeepseekAttention
+        elif model_config.use_mla:
             attn_cls = DeepseekV2MLAAttention
         else:
             attn_cls = DeepseekV2Attention
@@ -1007,11 +1102,11 @@ class DeepseekV2DecoderLayer(nn.Module):
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            qk_nope_head_dim=config.qk_nope_head_dim,
-            qk_rope_head_dim=config.qk_rope_head_dim,
-            v_head_dim=config.v_head_dim,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
             q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None,
-            kv_lora_rank=config.kv_lora_rank,
+            kv_lora_rank=kv_lora_rank,
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -1024,7 +1119,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         if (
             config.n_routed_experts is not None
             and layer_idx >= config.first_k_dense_replace
-            and layer_idx % config.moe_layer_freq == 0
+            and layer_idx % moe_layer_freq == 0
         ):
             self.mlp = DeepseekV2MoE(
                 config=config,
@@ -1044,7 +1139,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         self.post_attention_layernorm = RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps
         )
-        self.routed_scaling_factor = config.routed_scaling_factor
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
 
     def forward(
         self,
@@ -1063,7 +1158,10 @@ class DeepseekV2DecoderLayer(nn.Module):
             hidden_states=hidden_states,
         )
 
-        if hidden_states.dtype == torch.float16:
+        if (
+            not isinstance(self.self_attn, DeepseekAttention)
+            and hidden_states.dtype == torch.float16
+        ):
             # Fix FP16 overflow
             # We scale both hidden_states and residual before
             # rmsnorm, and rmsnorm result would not affect by scale.
@@ -1122,7 +1220,6 @@ class DeepseekV2Model(nn.Module):
             )
         else:
             self.embed_tokens = PPMissingLayer()
-
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: DeepseekV2DecoderLayer(
@@ -1172,7 +1269,50 @@ class DeepseekV2Model(nn.Module):
         return hidden_states
 
 
-class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoRA):
+class DeepseekV2MixtureOfExperts(MixtureOfExperts):
+    moe_mlp_layers: list[DeepseekV2MoE]
+    """
+    List of MoE MLP layers in the model.
+    """
+
+    def extract_moe_parameters(self, example_moe: DeepseekV2MoE | None):
+        if example_moe is None:
+            self.num_moe_layers = 0
+            self.num_expert_groups = 0
+            self.num_logical_experts = 0
+            self.num_physical_experts = 0
+            self.num_local_physical_experts = 0
+            self.num_routed_experts = 0
+            self.num_shared_experts = 0
+            self.num_redundant_experts = 0
+            logger.warning("DeepSeekV2: No DeepseekV2MoE layer found in model.layers.")
+        else:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for moe in self.moe_mlp_layers:
+            moe.n_local_physical_experts = num_local_physical_experts
+            moe.n_physical_experts = num_physical_experts
+            moe.n_redundant_experts = self.num_redundant_experts
+            moe.experts.update_expert_map()
+
+
+class DeepseekV2ForCausalLM(
+    nn.Module, SupportsPP, DeepseekV2MixtureOfExperts, SupportsLoRA
+):
     packed_modules_mapping = {
         "gate_up_proj": ["gate_proj", "up_proj"],
     }
@@ -1184,6 +1324,15 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoR
         self.config = config
         self.quant_config = quant_config
 
+        qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 0)
+        qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 0)
+        self.use_mha = config.model_type == "deepseek" or all(
+            dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
+        )
+
+        if self.use_mha:
+            self.packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]
+
         # `packed_modules_mapping` needs to be modified before
         # initializing DeepseekV2Model, as it is passed inplace to
         # quantization config init and may be used to select the
@@ -1213,13 +1362,19 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoR
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors
         )
+        # Set MoE hyperparameters
+        self.num_moe_layers = (
+            self.config.num_hidden_layers - self.config.first_k_dense_replace
+        )
+        self.set_moe_parameters()
+
+    def set_moe_parameters(self):
         self.expert_weights = []
 
-        # Set MoE hyperparameters
-        self.num_moe_layers = config.num_hidden_layers - config.first_k_dense_replace
-        self.num_expert_groups = config.n_group
+        self.num_expert_groups = getattr(self.config, "n_group", 1)
 
-        self.moe_layers: list[SharedFusedMoE] = []
+        self.moe_layers = []
+        self.moe_mlp_layers = []
         example_moe = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
@@ -1229,50 +1384,10 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoR
             if isinstance(layer.mlp, DeepseekV2MoE):
                 # Pick last one layer since the first ones may be dense layers.
                 example_moe = layer.mlp
+                self.moe_mlp_layers.append(layer.mlp)
                 self.moe_layers.append(layer.mlp.experts)
 
-        if example_moe is None:
-            raise RuntimeError("No DeepseekV2MoE layer found in model.layers.")
-
-        self.num_logical_experts = example_moe.n_logical_experts
-        self.num_physical_experts = example_moe.n_physical_experts
-        self.num_local_physical_experts = example_moe.n_local_physical_experts
-        self.num_routed_experts = example_moe.n_routed_experts
-        self.num_shared_experts = example_moe.n_shared_experts
-        self.num_redundant_experts = example_moe.n_redundant_experts
-
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
-    def update_physical_experts_metadata(
-        self,
-        num_physical_experts: int,
-        num_local_physical_experts: int,
-    ) -> None:
-        assert self.num_local_physical_experts == num_local_physical_experts
-        self.num_physical_experts = num_physical_experts
-        self.num_local_physical_experts = num_local_physical_experts
-        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
-        for layer in self.model.layers:
-            if isinstance(layer.mlp, DeepseekV2MoE):
-                moe = layer.mlp
-                moe.n_local_physical_experts = num_local_physical_experts
-                moe.n_physical_experts = num_physical_experts
-                moe.n_redundant_experts = self.num_redundant_experts
-                moe.experts.update_expert_map()
+        self.extract_moe_parameters(example_moe)
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
@@ -1308,13 +1423,27 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoR
         )
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        rocm_aiter_moe_shared_expert_enabled = (
+            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+        )
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
+        ]
+        mla_params_mapping = [
             ("fused_qkv_a_proj", "q_a_proj", 0),
             ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
         ]
+        mha_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        if self.use_mha:
+            stacked_params_mapping.extend(mha_params_mapping)
+        else:
+            stacked_params_mapping.extend(mla_params_mapping)
 
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
@@ -1325,7 +1454,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoR
             num_experts=self.config.n_routed_experts
             + (
                 self.config.n_shared_experts
-                if is_rocm_aiter_fusion_shared_expert_enabled()
+                if rocm_aiter_moe_shared_expert_enabled
                 else 0
             ),
             num_redundant_experts=self.num_redundant_experts,
@@ -1341,9 +1470,8 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoR
             if spec_layer is not None:
                 continue  # skip spec decode layers for main model
 
-            is_fuse_shared_experts_layer = (
-                is_rocm_aiter_fusion_shared_expert_enabled()
-                and ("mlp.shared_experts" in name)
+            is_fuse_shared_experts_layer = rocm_aiter_moe_shared_expert_enabled and (
+                "mlp.shared_experts" in name
             )
 
             for param_name, weight_name, shard_id in stacked_params_mapping:
@@ -1497,6 +1625,10 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoR
         return loaded_params
 
 
+class DeepseekForCausalLM(DeepseekV2ForCausalLM):
+    pass
+
+
 class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
     pass
 
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index ea10245a84ee1..306eef3dca990 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -403,18 +403,10 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
                 f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
             )
 
-        if self.text_config.topk_method == "noaux_tc":
-            architectures = ["DeepseekV3ForCausalLM"]
-        elif not self.text_config.use_mla:
-            architectures = ["DeepseekForCausalLM"]
-        else:
-            architectures = ["DeepseekV2ForCausalLM"]
-
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=self.text_config,
             prefix=maybe_prefix(prefix, "language"),
-            architectures=architectures,
         )
 
         self.make_empty_intermediate_tensors = (
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index c33cb3d84478e..15caa3184581d 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -240,6 +240,7 @@ class Dots1Attention(nn.Module):
             self.total_num_kv_heads,
             bias=attention_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
 
         self.o_proj = RowParallelLinear(
@@ -247,6 +248,7 @@ class Dots1Attention(nn.Module):
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
diff --git a/vllm/model_executor/models/ernie45.py b/vllm/model_executor/models/ernie45.py
index b1d26cddcc5eb..c1a4737e1f326 100644
--- a/vllm/model_executor/models/ernie45.py
+++ b/vllm/model_executor/models/ernie45.py
@@ -23,12 +23,22 @@
 # limitations under the License.
 """Inference-only Erine model compatible with HuggingFace weights."""
 
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 
 from .utils import PPMissingLayer
 
 
+@support_torch_compile(
+    # set dynamic_arg_dims to support mrope
+    dynamic_arg_dims={
+        "input_ids": 0,
+        "positions": -1,
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+    }
+)
 class Ernie4_5ForCausalLM(LlamaForCausalLM):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index 192ca05852304..b35666175ea7b 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -133,7 +133,7 @@ class Ernie4_5_MoeMoE(nn.Module):
 
         self.moe_num_shared_experts = getattr(config, "moe_num_shared_experts", None)
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts: int = config.moe_num_experts
         self.n_shared_experts: int = self.moe_num_shared_experts
@@ -709,22 +709,6 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExpe
             self.num_shared_experts = example_moe.n_shared_experts
             self.num_redundant_experts = example_moe.n_redundant_experts
 
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
     def update_physical_experts_metadata(
         self,
         num_physical_experts: int,
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 86536b21c33fc..f287cff12086b 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -1367,6 +1367,23 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors
         )
+        if getattr(self.config, "im_patch_id", None):
+            visual_token_ids = [
+                token_id
+                for token_id in [
+                    self.config.im_patch_id,
+                    getattr(self.config, "image_start_token_id", None),
+                    getattr(self.config, "image_end_token_id", None),
+                    getattr(self.config, "video_start_token_id", None),
+                    getattr(self.config, "video_end_token_id", None),
+                ]
+                if token_id is not None
+            ]
+            self._visual_token_ids_tensor_cache = torch.tensor(
+                visual_token_ids, dtype=torch.long
+            )
+        else:
+            self._visual_token_ids_tensor_cache = None
 
     def compute_logits(
         self,
@@ -1398,12 +1415,19 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         return image_features
 
     def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
-        if getattr(self.config, "im_patch_id", None) is not None:
-            self.visual_token_mask = (input_ids == self.config.im_patch_id).reshape(
-                -1, 1
-            )
-        else:
+        """Set mask for visual tokens (image/video patches and delimiters)."""
+        if self._visual_token_ids_tensor_cache is None:
             self.visual_token_mask = None
+            return
+        # Create tensor on the correct device
+        visual_token_ids_tensor = self._visual_token_ids_tensor_cache.to(
+            device=input_ids.device,
+            dtype=input_ids.dtype,
+        )
+
+        self.visual_token_mask = torch.isin(input_ids, visual_token_ids_tensor).reshape(
+            -1, 1
+        )
 
     def get_mrope_input_positions(
         self,
@@ -1411,8 +1435,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         hf_config: PretrainedConfig,
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
-        context_len: int = 0,
-        seq_len: int | None = None,
         second_per_grid_ts: list[float] | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
@@ -1545,7 +1567,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(
             llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        llm_positions = llm_positions[:, context_len:seq_len]
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
         return llm_positions, mrope_position_delta
 
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 25429836b9ed6..1b9c7da334909 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -137,6 +137,7 @@ class FalconAttention(nn.Module):
             bias=config.bias,
             skip_bias_add=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
         )
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
@@ -153,6 +154,7 @@ class FalconAttention(nn.Module):
             skip_bias_add=True,
             quant_config=quant_config,
             reduce_results=self.reduce_row_parallel_results,
+            prefix=f"{prefix}.dense",
         )
 
         self.use_rotary = config.rotary
@@ -227,6 +229,7 @@ class FalconMLP(nn.Module):
         self,
         config: FalconConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
@@ -237,6 +240,7 @@ class FalconMLP(nn.Module):
             bias=config.bias,
             skip_bias_add=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_h_to_4h",
         )
         self.act = get_act_fn("gelu")
         self.reduce_row_parallel_results = not (
@@ -249,6 +253,7 @@ class FalconMLP(nn.Module):
             skip_bias_add=True,
             reduce_results=self.reduce_row_parallel_results,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_4h_to_h",
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -275,7 +280,7 @@ class FalconDecoderLayer(nn.Module):
         self.self_attention = FalconAttention(
             config, cache_config, quant_config, prefix=f"{prefix}.self_attention"
         )
-        self.mlp = FalconMLP(config, quant_config)
+        self.mlp = FalconMLP(config, quant_config, prefix=f"{prefix}.mlp")
         self.config = config
 
         if not hasattr(config, "num_ln_in_parallel_attn"):
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index 8bf700b474a41..ac5846cfd8695 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -59,6 +59,7 @@ class FalconH1MLP(nn.Module):
         config: FalconH1Config,
         quant_config: QuantizationConfig | None = None,
         bias: bool = False,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -66,12 +67,14 @@ class FalconH1MLP(nn.Module):
             output_sizes=[config.intermediate_size] * 2,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
             input_size=config.intermediate_size,
             output_size=config.hidden_size,
             bias=bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         self.tp_size = get_tensor_model_parallel_world_size()
         self.intermediate_size = config.intermediate_size
@@ -365,7 +368,7 @@ class FalconH1ParallelHybrid(nn.Module):
         self.attention_in_multiplier = config.attention_in_multiplier
         self.attn_out_multiplier = config.attention_out_multiplier
 
-        self.feed_forward = FalconH1MLP(config)
+        self.feed_forward = FalconH1MLP(config, prefix=f"{prefix}.feed_forward")
 
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.pre_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 66c9b774f174d..1938efd4895e5 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -66,13 +66,22 @@ class Gemma2MLP(nn.Module):
         hidden_act: str,
         hidden_activation: str,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
-            intermediate_size, hidden_size, bias=False, quant_config=quant_config
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if not (hidden_act == hidden_activation == "gelu_pytorch_tanh"):
             raise ValueError(
@@ -134,12 +143,14 @@ class Gemma2Attention(nn.Module):
             self.total_num_kv_heads,
             bias=config.attention_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=config.attention_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -208,6 +219,7 @@ class Gemma2DecoderLayer(nn.Module):
             hidden_act=config.hidden_act,
             hidden_activation=config.hidden_activation,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
         )
         self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = GemmaRMSNorm(
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 748605b4ed5ac..8e1dbd9e2cea7 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -9,7 +9,6 @@ from torch import nn
 from transformers import BatchFeature, Gemma3Config, Gemma3Processor
 from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
@@ -137,11 +136,10 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         if not do_pan_and_scan:
             return 0
 
-        if envs.VLLM_USE_V1:
-            logger.warning_once(
-                "`do_pan_and_scan=True` has suboptimal results on V1 "
-                "because of the simplified attention pattern being used."
-            )
+        logger.warning_once(
+            "`do_pan_and_scan=True` has suboptimal results on V1 "
+            "because of the simplified attention pattern being used."
+        )
 
         # Based on Gemma3ImageProcessor.pan_and_scan
         if image_width >= image_height:
@@ -403,7 +401,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
 
         def get_repl_toks(tok: int) -> list[int]:
             if tok == newline_3:
-                return [newline_2, newline_1]
+                return [newline_1, newline_2]
             if tok == newline_4:
                 return [newline_2, newline_2]
 
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index f7a732e3a601c..547884f393eb0 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -357,8 +357,27 @@ class Gemma3nAttention(nn.Module):
             offset = 2 if self.sliding_window is not None else 1
             kv_shared_layer_index = first_kv_shared_layer_idx - offset
             if kv_shared_layer_index >= 0:
+                # Different model wrappers expose layer parameters under
+                # different parent attributes.
+                # For example:
+                #   - Gemma3nForCausalLM → parameters live under "model.layers"
+                #   - Gemma3nForConditionalGeneration →
+                #     under "language_model.model.layers"
+                # This logic extracts the portion of the parameter name
+                # *before* ".layers."
+                # so downstream code can consistently reference the correct
+                # model root regardless of which wrapper class was used.
+                if ".layers." in prefix:
+                    param_name_before_layers = prefix.split(".layers.")[0]
+                else:
+                    raise ValueError(
+                        "Unexpected prefix format for Gemma3nAttention: "
+                        f"'{prefix}'. The prefix is expected to contain "
+                        "'.layers.' to correctly determine the KV sharing "
+                        "target layer."
+                    )
                 # Only the greater layer is required to specify sharing.
-                kv_sharing_target_layer_name = f"language_model.model.layers.{kv_shared_layer_index}.self_attn.attn"  # noqa: E501
+                kv_sharing_target_layer_name = f"{param_name_before_layers}.layers.{kv_shared_layer_index}.self_attn.attn"  # noqa: E501
 
         self.rotary_emb = get_rope(
             self.head_dim,
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 3e243385fd049..b9cd3545ec453 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -370,7 +370,7 @@ class Glm4vVisionAttention(nn.Module):
                 cu_seqlens_k=cu_seqlens,
                 max_seqlen_q=max_seqlen,
                 max_seqlen_k=max_seqlen,
-                dropout_p=0,
+                dropout_p=0.0,
                 causal=False,
             )
 
@@ -1622,8 +1622,6 @@ class Glm4vForConditionalGeneration(
         image_grid_thw: list[list[int]] | torch.Tensor | None,
         video_grid_thw: list[list[int]] | torch.Tensor | None,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -1754,7 +1752,6 @@ class Glm4vForConditionalGeneration(
             llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        llm_positions = llm_positions[:, context_len:seq_len]
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
         return llm_positions, mrope_position_delta
 
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index a53f52852c6ad..b30bd66161da9 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -62,7 +62,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 )
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
@@ -127,7 +127,7 @@ class Glm4MoE(nn.Module):
         self.routed_scaling_factor = config.routed_scaling_factor
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts: int = config.n_routed_experts
         self.n_shared_experts: int = config.n_shared_experts
@@ -616,7 +616,35 @@ class Glm4MoeModel(nn.Module):
         return loaded_params
 
 
-class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+class Glm4MixtureOfExperts(MixtureOfExperts):
+    def extract_moe_parameters(self, example_moe: Glm4MoE | None) -> None:
+        if example_moe is None:
+            raise RuntimeError("No Glm4MoE layer found in model.layers.")
+        else:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for moe in self.moe_mlp_layers:
+            moe.n_local_physical_experts = num_local_physical_experts
+            moe.n_physical_experts = num_physical_experts
+            moe.n_redundant_experts = self.num_redundant_experts
+            moe.experts.update_expert_map()
+
+
+class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, Glm4MixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -659,7 +687,9 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         self.num_moe_layers = config.num_hidden_layers - config.first_k_dense_replace
         self.num_expert_groups = config.n_group
 
-        self.moe_layers: list[SharedFusedMoE] = []
+        self.moe_layers = []
+        self.moe_mlp_layers: list[Glm4MoE] = []
+
         example_moe = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
@@ -669,33 +699,10 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
             if isinstance(layer.mlp, Glm4MoE):
                 # Pick last one layer since the first ones may be dense layers.
                 example_moe = layer.mlp
+                self.moe_mlp_layers.append(layer.mlp)
                 self.moe_layers.append(layer.mlp.experts)
 
-        if example_moe is None:
-            raise RuntimeError("No Glm4MoE layer found in model.layers.")
-
-        self.num_logical_experts = example_moe.n_logical_experts
-        self.num_physical_experts = example_moe.n_physical_experts
-        self.num_local_physical_experts = example_moe.n_local_physical_experts
-        self.num_routed_experts = example_moe.n_routed_experts
-        self.num_shared_experts = example_moe.n_shared_experts
-        self.num_redundant_experts = example_moe.n_redundant_experts
-
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
+        self.extract_moe_parameters(example_moe)
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py
index 9fb1be7ba45c4..9a2ae3c476f07 100644
--- a/vllm/model_executor/models/glm4_moe_mtp.py
+++ b/vllm/model_executor/models/glm4_moe_mtp.py
@@ -29,7 +29,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -41,7 +41,12 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
 
-from .glm4_moe import Glm4MoeDecoderLayer, get_spec_layer_idx_from_weight_name
+from .glm4_moe import (
+    Glm4MixtureOfExperts,
+    Glm4MoE,
+    Glm4MoeDecoderLayer,
+    get_spec_layer_idx_from_weight_name,
+)
 from .interfaces import SupportsPP
 from .utils import maybe_prefix
 
@@ -73,6 +78,7 @@ class Glm4MoeMultiTokenPredictorLayer(nn.Module):
         prefix: str,
         cache_config: CacheConfig | None = None,
         quant_config: QuantizationConfig | None = None,
+        parallel_config: ParallelConfig | None = None,
     ) -> None:
         super().__init__()
         self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -81,11 +87,13 @@ class Glm4MoeMultiTokenPredictorLayer(nn.Module):
         self.shared_head = SharedHead(
             config=config, prefix=prefix, quant_config=quant_config
         )
+        self.enable_eplb = parallel_config.enable_eplb
         self.mtp_block = Glm4MoeDecoderLayer(
             config=config,
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=prefix,
+            enable_eplb=self.enable_eplb,
         )
 
     def forward(
@@ -127,6 +135,7 @@ class Glm4MoeMultiTokenPredictor(nn.Module):
                     f"{prefix}.layers.{idx}",
                     cache_config=vllm_config.cache_config,
                     quant_config=vllm_config.quant_config,
+                    parallel_config=vllm_config.parallel_config,
                 )
                 for idx in range(
                     self.mtp_start_layer_idx,
@@ -175,7 +184,7 @@ class Glm4MoeMultiTokenPredictor(nn.Module):
         return logits
 
 
-class Glm4MoeMTP(nn.Module, SupportsPP):
+class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
@@ -183,6 +192,25 @@ class Glm4MoeMTP(nn.Module, SupportsPP):
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
 
+        self.expert_weights = []
+
+        # Set MoE hyperparameters
+        self.num_moe_layers = self.config.num_nextn_predict_layers
+        self.num_expert_groups = self.config.n_group
+
+        self.moe_layers: list[FusedMoE] = []
+        self.moe_mlp_layers: list[Glm4MoE] = []
+        example_moe = None
+        for layer in self.model.layers.values():
+            assert isinstance(layer, Glm4MoeMultiTokenPredictorLayer)
+            layer = layer.mtp_block
+            assert isinstance(layer, Glm4MoeDecoderLayer)
+            if isinstance(layer.mlp, Glm4MoE):
+                example_moe = layer.mlp
+                self.moe_mlp_layers.append(layer.mlp)
+                self.moe_layers.append(layer.mlp.experts)
+        self.extract_moe_parameters(example_moe)
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 2de1e48109521..ebf6934dddead 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -625,8 +625,6 @@ class GLM4VForCausalLM(
         hf_config: PretrainedConfig,
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
-        context_len: int = 0,
-        seq_len: int | None = None,
         second_per_grid_ts: list[float] | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
@@ -758,7 +756,6 @@ class GLM4VForCausalLM(
             llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        llm_positions = llm_positions[:, context_len:seq_len]
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
         return llm_positions, mrope_position_delta
 
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 1777fd3583c39..e04b2465e54ae 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -78,12 +78,14 @@ class GPTJAttention(nn.Module):
             self.total_num_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.out_proj = RowParallelLinear(
             config.hidden_size,
             config.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
         )
 
         tp_world_size = get_tensor_model_parallel_world_size()
@@ -130,6 +132,7 @@ class GPTJMLP(nn.Module):
         intermediate_size: int,
         config: GPTJConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.n_embd
@@ -137,11 +140,13 @@ class GPTJMLP(nn.Module):
             hidden_size,
             intermediate_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.fc_in",
         )
         self.fc_out = RowParallelLinear(
             intermediate_size,
             hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.fc_out",
         )
         self.act = get_act_fn(config.activation_function)
 
@@ -166,7 +171,7 @@ class GPTJBlock(nn.Module):
         self.attn = GPTJAttention(
             config, cache_config, quant_config, prefix=f"{prefix}.attn"
         )
-        self.mlp = GPTJMLP(inner_dim, config, quant_config)
+        self.mlp = GPTJMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 2f638acaa2b66..e6c145602d29a 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -80,12 +80,14 @@ class GPTNeoXAttention(nn.Module):
             self.total_num_heads,
             bias=self.bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
         )
         self.dense = RowParallelLinear(
             config.hidden_size,
             config.hidden_size,
             bias=self.bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense",
         )
         scaling = self.head_size**-0.5
         rotary_dim = int(self.head_size * config.rotary_pct)
@@ -125,17 +127,20 @@ class GPTNeoXMLP(nn.Module):
         self,
         config: GPTNeoXConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.dense_h_to_4h = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_h_to_4h",
         )
         self.dense_4h_to_h = RowParallelLinear(
             config.intermediate_size,
             config.hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense_4h_to_h",
         )
         self.act = get_act_fn(config.hidden_act)
 
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 44f6824b52129..04038ae74882d 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -25,12 +25,14 @@ from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLine
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.utils import rocm_unquantized_gemm
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.utils import sequence_parallel_chunk
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils.math_utils import cdiv
 
@@ -153,6 +155,7 @@ class MLPBlock(torch.nn.Module):
 
         self.layer_idx = layer_idx
         self.num_experts = config.num_local_experts
+        self.hidden_size = config.hidden_size
         self.experts_per_token = config.num_experts_per_tok
         self.world_size = dist.get_world_size() if dist.is_initialized() else 1
         self.router = torch.nn.Linear(config.hidden_size, config.num_local_experts)
@@ -177,7 +180,12 @@ class MLPBlock(torch.nn.Module):
         if self.is_sequence_parallel:
             x = sequence_parallel_chunk(x)
 
-        g = self.router(x)
+        if current_platform.is_rocm():
+            g = rocm_unquantized_gemm(
+                self, x[:, : self.hidden_size], self.router.weight, self.router.bias
+            )
+        else:
+            g = self.router(x)
         x = self.experts(hidden_states=x, router_logits=g)
 
         if self.is_sequence_parallel:
@@ -329,9 +337,6 @@ class GptOssModel(nn.Module):
             if is_pp_missing_parameter(name, self):
                 continue
 
-            # FIXME(woosuk): Remove this after testing.
-            weight = weight.cuda()
-
             if ".w13_weight_scale" in name:
                 # Handle MLP gate and up projection weights scale
                 if use_ep:
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index 043b1406bd371..3ddf02bbba2ea 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -26,15 +26,17 @@
 
 import math
 from collections.abc import Iterable, Mapping
-from typing import Annotated
+from typing import Annotated, Literal, cast
 
+import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import BatchFeature, PretrainedConfig
 
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
+from vllm.inputs.data import PromptType
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.module_mapping import MultiModelKeys
@@ -57,6 +59,8 @@ from vllm.multimodal.processing import (
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.processor import cached_get_processor
+from vllm.transformers_utils.tokenizer import cached_get_tokenizer
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .blip2 import Blip2QFormerModel
@@ -65,9 +69,22 @@ from .interfaces import (
     SupportsLoRA,
     SupportsMultiModal,
     SupportsPP,
+    SupportsTranscription,
 )
 from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
+# NOTE lang support is based on what is written here:
+# https://huggingface.co/ibm-granite/granite-speech-3.3-2b
+# Though this may vary from model to model, and also many langs
+# work pretty well with zero shot.
+ISO639_1_SUPPORTED_LANGS = {
+    "en": "English",
+    "fr": "French",
+    "de": "German",
+    "pt": "Portuguese",
+    "es": "Spanish",
+}
+
 
 ### Audio Input
 class GraniteSpeechAudioInputs(TensorSchema):
@@ -545,8 +562,10 @@ class GraniteSpeechForConditionalGeneration(
     SupportsMultiModal,
     SupportsPP,
     SupportsLoRA,
+    SupportsTranscription,
 ):
     merge_by_field_config = True
+    supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
         "qkv_proj": [
@@ -816,3 +835,79 @@ class GraniteSpeechForConditionalGeneration(
             connector="projector",
             tower_model="encoder",
         )
+
+    ### Support for speech-to-text Transcription
+    @classmethod
+    def get_generation_prompt(
+        cls,
+        audio: np.ndarray,
+        model_config: ModelConfig,
+        stt_config: SpeechToTextConfig,
+        language: str | None,
+        task_type: Literal["transcribe", "translate"],
+        request_prompt: str,
+        to_language: str | None,
+    ) -> PromptType:
+        """Get the generation prompt to be used for transcription requests."""
+        # Audio placeholders don't use an index, so value doesn't matter
+        audio_tok = cls.get_placeholder_str("audio", 0)
+
+        if task_type == "translate":
+            full_lang_name_to = cls.supported_languages.get(to_language, to_language)
+            user_prompt = f"{audio_tok}translate the speech to {full_lang_name_to}"  # noqa: E501
+        elif task_type == "transcribe":
+            user_prompt = (
+                f"{audio_tok}can you transcribe the speech into a written format?"  # noqa: E501
+            )
+        else:
+            raise ValueError(f"Unsupported task type {task_type}")
+
+        tokenizer = cached_get_tokenizer(model_config.model)
+        chat = [dict(role="user", content=user_prompt)]
+        prompt = tokenizer.apply_chat_template(
+            chat,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+        prompt_token_ids = tokenizer.encode(prompt)
+        prompt = {
+            "prompt_token_ids": prompt_token_ids,
+            "multi_modal_data": {"audio": audio},
+        }
+        return cast(PromptType, prompt)
+
+    # Adapted from https://github.com/huggingface/transformers/blob/v4.56.0/src/transformers/models/granite_speech/feature_extraction_granite_speech.py#L122 # noqa: E501
+    @classmethod
+    def get_num_audio_tokens(
+        cls,
+        audio_duration_s: float,
+        stt_config: SpeechToTextConfig,
+        model_config: ModelConfig,
+    ) -> int | None:
+        """Get the number of audio tokens for an audio duration in sec."""
+        processor = cached_get_processor(model_config.model)
+        hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
+        proj_win_size = processor.audio_processor.projector_window_size
+        ds_rate = processor.audio_processor.projector_downsample_rate
+        effective_window_size = proj_win_size // ds_rate
+
+        raw_length = audio_duration_s * stt_config.sample_rate
+
+        # mel sequence length computation
+        mel_length = raw_length // hop_length + 1
+        # encoder frame takes two mel features
+        encoder_length = mel_length // 2
+        nblocks = math.ceil(encoder_length / proj_win_size)
+        # projector output length
+        return nblocks * effective_window_size
+
+    @classmethod
+    def get_speech_to_text_config(
+        cls, model_config: ModelConfig, task_type: str
+    ) -> SpeechToTextConfig:
+        """Get the stt config for this model."""
+        # Default settings are reasonable for this model and we don't currently
+        # expose this information in the model configs, but this may change in
+        # the future
+        return SpeechToTextConfig()
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index 901f29310872b..8fa9776bd0186 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -374,7 +374,7 @@ class HunYuanSparseMoeBlock(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts = config.num_experts
 
@@ -1007,7 +1007,7 @@ class HunYuanMoEV1Base(HunyuanV1ModelBase, MixtureOfExperts):
         # Set MoE hyperparameters
         self.expert_weights = []
         self.num_expert_groups = 1
-        self.moe_layers: list[SharedFusedMoE] = []
+        self.moe_layers = []
         example_layer = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
@@ -1028,22 +1028,6 @@ class HunYuanMoEV1Base(HunyuanV1ModelBase, MixtureOfExperts):
         self.num_routed_experts = example_layer.n_routed_experts
         self.num_redundant_experts = example_layer.n_redundant_experts
 
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            self.expert_weights.append(layer.get_expert_weights())
-            # Register the expert weights.
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
     def update_physical_experts_metadata(
         self,
         num_physical_experts: int,
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index e133206c27a8b..d6a8f86d998bb 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Callable, Iterable, Mapping, MutableSequence
+from collections.abc import Callable, Iterable, Mapping, MutableSequence, Set
 from typing import (
     TYPE_CHECKING,
     ClassVar,
@@ -14,6 +14,7 @@ from typing import (
 
 import numpy as np
 import torch
+import torch.nn as nn
 from torch import Tensor
 from transformers import PretrainedConfig
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
@@ -80,6 +81,11 @@ class SupportsMultiModal(Protocol):
     `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
     """
 
+    multimodal_cpu_fields: ClassVar[Set[str]] = frozenset()
+    """
+    A set indicating CPU-only multimodal fields.
+    """
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         """
@@ -641,6 +647,9 @@ class MixtureOfExperts(Protocol):
     num_redundant_experts: int
     """Number of redundant experts in this model."""
 
+    moe_layers: Iterable[nn.Module]
+    """List of MoE layers in this model."""
+
     def set_eplb_state(
         self,
         expert_load_view: Tensor,
@@ -663,7 +672,15 @@ class MixtureOfExperts(Protocol):
             logical_to_physical_map: Mapping from logical to physical experts.
             logical_replica_count: Count of replicas for each logical expert.
         """
-        ...
+        for layer_idx, layer in enumerate(self.moe_layers):
+            # Register the expert weights.
+            self.expert_weights.append(layer.get_expert_weights())
+            layer.set_eplb_state(
+                moe_layer_idx=layer_idx,
+                expert_load_view=expert_load_view,
+                logical_to_physical_map=logical_to_physical_map,
+                logical_replica_count=logical_replica_count,
+            )
 
     def update_physical_experts_metadata(
         self,
@@ -978,8 +995,6 @@ class SupportsMRoPE(Protocol):
         image_grid_thw: list[list[int]] | torch.Tensor | None,
         video_grid_thw: list[list[int]] | torch.Tensor | None,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -995,8 +1010,6 @@ class SupportsMRoPE(Protocol):
             image_grid_thw: Image grid dimensions (t, h, w)
             video_grid_thw: Video grid dimensions (t, h, w)
             second_per_grid_ts: Seconds per grid timestep for videos
-            context_len: Context length
-            seq_len: Sequence length
             audio_feature_lengths: Audio feature lengths for multimodal models
             use_audio_in_video: Whether to use audio in video for interleaving
 
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index 1daaed80b1440..782ab6f1e2da2 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -107,12 +107,14 @@ class JAISAttention(nn.Module):
             total_num_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_attn",
         )
         self.c_proj = RowParallelLinear(
             self.hidden_size,
             self.hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
 
         tp_rank = get_tensor_model_parallel_rank()
@@ -147,6 +149,7 @@ class JAISMLP(nn.Module):
         intermediate_size: int,
         config: JAISConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
@@ -156,6 +159,7 @@ class JAISMLP(nn.Module):
             intermediate_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_fc",
         )
         self.c_fc2 = (
             ColumnParallelLinear(
@@ -163,6 +167,7 @@ class JAISMLP(nn.Module):
                 intermediate_size,
                 bias=True,
                 quant_config=quant_config,
+                prefix=f"{prefix}.c_fc2",
             )
             if self.swiglu
             else None
@@ -172,6 +177,7 @@ class JAISMLP(nn.Module):
             hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
 
         self.act = SwiGLUActivation()
@@ -206,7 +212,7 @@ class JAISBlock(nn.Module):
             config, cache_config, quant_config, prefix=f"{prefix}.attn"
         )
         self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.mlp = JAISMLP(inner_dim, config, quant_config)
+        self.mlp = JAISMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index f8a87cf6965f8..0cb993901fd38 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -38,7 +38,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaMLP as JambaMLP
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
+from .interfaces import (
+    HasInnerState,
+    IsHybrid,
+    SupportsLoRA,
+    SupportsMambaPrefixCaching,
+    SupportsPP,
+)
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
@@ -214,12 +220,14 @@ class JambaAttentionDecoderLayer(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             config.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.attn = Attention(
@@ -454,7 +462,14 @@ class JambaModel(nn.Module):
         return loaded_params
 
 
-class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHybrid):
+class JambaForCausalLM(
+    nn.Module,
+    HasInnerState,
+    SupportsLoRA,
+    SupportsPP,
+    IsHybrid,
+    SupportsMambaPrefixCaching,
+):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={".self_attn.": ".", ".A_log": ".A"},
     )
@@ -477,12 +492,8 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHyb
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         lora_config = vllm_config.lora_config
         scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, (
-            "Jamba currently does not support prefix caching"
-        )
 
         super().__init__()
         self.config = config
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 5f8659a3064eb..42f16ad9f3b3a 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1630,8 +1630,6 @@ class KeyeForConditionalGeneration(
         hf_config: PretrainedConfig,
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
-        context_len: int = 0,
-        seq_len: int | None = None,
         second_per_grid_ts: list[float] | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
@@ -1759,6 +1757,5 @@ class KeyeForConditionalGeneration(
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py
index 13e5b2d5f1575..6f95a59d36d29 100644
--- a/vllm/model_executor/models/keye_vl1_5.py
+++ b/vllm/model_executor/models/keye_vl1_5.py
@@ -600,8 +600,6 @@ class KeyeVL1_5ForConditionalGeneration(
         hf_config: PretrainedConfig,
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
-        context_len: int = 0,
-        seq_len: int | None = None,
         second_per_grid_ts: list[float] | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
@@ -729,6 +727,5 @@ class KeyeVL1_5ForConditionalGeneration(
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index c2630fa6ac2b6..b54f53931d714 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -50,7 +50,7 @@ from typing import Annotated, Any, Literal
 
 import torch
 from torch import nn
-from transformers import BatchFeature
+from transformers import BatchFeature, DeepseekV2Config
 from transformers.activations import GELUActivation
 
 from vllm.config import VllmConfig
@@ -91,7 +91,6 @@ from vllm.multimodal.processing import (
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix
diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py
index bb7926a9cfa9d..02a490e9c7fd9 100644
--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -105,7 +105,7 @@ class Lfm2MoeSparseMoeBlock(nn.Module):
         self.routed_scaling_factor = config.routed_scaling_factor
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts = config.num_experts
 
@@ -707,7 +707,7 @@ class Lfm2MoeForCausalLM(
         # Set MoE hyperparameters
         self.expert_weights = []
 
-        self.moe_layers: list[FusedMoE] = []
+        self.moe_layers = []
         example_layer = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
@@ -737,22 +737,6 @@ class Lfm2MoeForCausalLM(
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
     def update_physical_experts_metadata(
         self,
         num_physical_experts: int,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 7cc908e52c887..0a08bd376badc 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -160,6 +160,14 @@ class LlamaAttention(nn.Module):
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings
 
+        llama_4_scaling_config = getattr(config, "llama_4_scaling", None)
+        self.do_llama_4_scaling = llama_4_scaling_config is not None
+        if self.do_llama_4_scaling:
+            self.llama_4_scaling_original_max_position_embeddings = (
+                llama_4_scaling_config["original_max_position_embeddings"]
+            )
+            self.llama_4_scaling_beta = llama_4_scaling_config["beta"]
+
         self.qkv_proj = QKVParallelLinear(
             hidden_size=hidden_size,
             head_size=self.head_dim,
@@ -221,6 +229,17 @@ class LlamaAttention(nn.Module):
             prefix=f"{prefix}.attn",
         )
 
+    def _get_llama_4_attn_scale(self, positions: torch.Tensor) -> torch.Tensor:
+        # Llama4 scaling
+        scaling = 1 + self.llama_4_scaling_beta * torch.log(
+            1
+            + torch.floor(
+                positions / self.llama_4_scaling_original_max_position_embeddings
+            )
+        )
+        # Broadcast over head_dim
+        return scaling.unsqueeze(-1)
+
     def forward(
         self,
         positions: torch.Tensor,
@@ -229,6 +248,9 @@ class LlamaAttention(nn.Module):
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
+        if self.do_llama_4_scaling:
+            attn_scale = self._get_llama_4_attn_scale(positions)
+            q = (q * attn_scale).to(q.dtype)
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
         return output
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 33badb13fc9fb..a7e0732ec71e2 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -30,9 +30,11 @@ from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
+    get_ep_group,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -46,6 +48,7 @@ from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader,
     maybe_remap_kv_scale_name,
 )
+from vllm.model_executor.models.interfaces import MixtureOfExperts
 from vllm.model_executor.models.utils import sequence_parallel_chunk
 
 from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel
@@ -56,6 +59,8 @@ from .utils import (
     is_pp_missing_parameter,
 )
 
+logger = init_logger(__name__)
+
 
 class Llama4MoE(nn.Module):
     @staticmethod
@@ -80,6 +85,9 @@ class Llama4MoE(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.top_k = config.num_experts_per_tok
         self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = get_ep_group().rank_in_group
+        self.ep_size = self.ep_group.size()
 
         intermediate_size_moe = config.intermediate_size
         self.router = ReplicatedLinear(
@@ -101,6 +109,20 @@ class Llama4MoE(nn.Module):
             disable_tp=self.is_sequence_parallel,
         )
 
+        # Load balancing settings.
+        eplb_config = parallel_config.eplb_config if parallel_config else None
+        self.enable_eplb = parallel_config.enable_eplb if parallel_config else False
+        self.n_redundant_experts = (
+            eplb_config.num_redundant_experts if eplb_config else 0
+        )
+
+        self.n_routed_experts: int = config.num_local_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_shared_experts: int = 1
+        self.n_local_experts: int = config.num_local_experts
+        self.n_physical_experts = self.n_local_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
         self.experts = SharedFusedMoE(
             shared_experts=self.shared_expert,
             num_experts=config.num_local_experts,
@@ -114,6 +136,8 @@ class Llama4MoE(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.experts",
             is_sequence_parallel=self.is_sequence_parallel,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
         )
 
     def forward(self, hidden_states):
@@ -378,6 +402,9 @@ class Llama4Model(LlamaModel):
         layer_type: type[Llama4DecoderLayer] = Llama4DecoderLayer,
     ):
         self.num_experts = vllm_config.model_config.hf_config.num_local_experts
+        self.n_redundant_experts = (
+            vllm_config.parallel_config.eplb_config.num_redundant_experts
+        )
         super().__init__(vllm_config=vllm_config, prefix=prefix, layer_type=layer_type)
 
     def load_moe_expert_weights(
@@ -499,7 +526,6 @@ class Llama4Model(LlamaModel):
                 shard_id=shard_id,
                 expert_id=expert_id,
             )
-
             loaded_params.add(full_param_name)
             expert_param_loaded = True
 
@@ -526,6 +552,7 @@ class Llama4Model(LlamaModel):
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
             num_experts=self.num_experts,
+            num_redundant_experts=self.n_redundant_experts,
         )
         # Expert parameter mapping for the case where the expert weights are
         # fused into a single weight tensor.
@@ -683,7 +710,7 @@ class Llama4Model(LlamaModel):
         return loaded_params
 
 
-class Llama4ForCausalLM(LlamaForCausalLM):
+class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -702,6 +729,57 @@ class Llama4ForCausalLM(LlamaForCausalLM):
         super().__init__(
             vllm_config=vllm_config, prefix=prefix, layer_type=Llama4DecoderLayer
         )
+        # Set MoE hyperparameters
+        self.set_moe_parameters()
+
+    def set_moe_parameters(self):
+        self.expert_weights = []
+
+        self.moe_layers = []
+        example_moe = None
+        for layer in self.model.layers:
+            assert isinstance(layer, Llama4DecoderLayer)
+            if isinstance(layer.feed_forward, Llama4MoE):
+                # Pick last one layer since the first ones may be dense layers.
+                example_moe = layer.feed_forward
+                self.moe_layers.append(layer.feed_forward.experts)
+
+        if example_moe is None:
+            self.num_moe_layers = 0
+            self.num_expert_groups = 0
+            self.num_logical_experts = 0
+            self.num_physical_experts = 0
+            self.num_local_physical_experts = 0
+            self.num_routed_experts = 0
+            self.num_shared_experts = 0
+            self.num_redundant_experts = 0
+            logger.warning("No Llama4MoE layer found in model.layers.")
+        else:
+            self.num_moe_layers = len(self.moe_layers)
+            self.num_expert_groups = 1
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.model.layers:
+            if isinstance(layer.feed_forward, Llama4MoE):
+                moe = layer.feed_forward
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
 
     def _init_model(
         self,
diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py
index 90273463d64ed..b59176191e7aa 100644
--- a/vllm/model_executor/models/llama4_eagle.py
+++ b/vllm/model_executor/models/llama4_eagle.py
@@ -189,6 +189,9 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM):
             self.config.vocab_size, scale=logit_scale
         )
 
+        # Set MoE hyperparameters
+        self.set_moe_parameters()
+
     def get_language_model(self) -> torch.nn.Module:
         return self.model
 
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index fb145289fbfe9..f684203f6d35e 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -29,6 +29,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import (
     HasInnerState,
     IsAttentionFree,
+    SupportsMambaPrefixCaching,
     SupportsPP,
 )
 from vllm.sequence import IntermediateTensors
@@ -193,15 +194,13 @@ class MambaModel(nn.Module):
         return loaded_params
 
 
-class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP):
+class MambaForCausalLM(
+    nn.Module, HasInnerState, IsAttentionFree, SupportsPP, SupportsMambaPrefixCaching
+):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         lora_config = vllm_config.lora_config
         self.scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, (
-            "Mamba does not support prefix caching"
-        )
 
         super().__init__()
         self.config = config
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 09328b4722488..85d3542317a1d 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -191,13 +191,22 @@ class MiniCPMMLP(nn.Module):
         hidden_act: str,
         hidden_act_param: float,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
-            intermediate_size, hidden_size, bias=False, quant_config=quant_config
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if hidden_act == "silu":
             self.act_fn = SiluAndMul()
@@ -259,12 +268,14 @@ class MiniCPMAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
@@ -578,6 +589,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
+        parallel_config = vllm_config.parallel_config
 
         self.prefix = prefix
         self.vllm_config = vllm_config
@@ -613,6 +625,8 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors
         )
+        if parallel_config.enable_eplb and getattr(config, "num_experts", 0) > 0:
+            raise NotImplementedError("EPLB is not supported for MiniCPM yet.")
 
     def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""):
         return MiniCPMModel(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index ab4fe36476b92..d3b6966ee3a7f 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -96,6 +96,7 @@ class MiniCPM3Attention(nn.Module):
             self.num_heads * self.qk_head_dim,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.q_b_proj",
         )
 
         self.kv_a_proj_with_mqa = ReplicatedLinear(
@@ -103,6 +104,7 @@ class MiniCPM3Attention(nn.Module):
             self.kv_lora_rank + self.qk_rope_head_dim,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.kv_a_proj_with_mqa",
         )
         self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
         self.kv_b_proj = ColumnParallelLinear(
@@ -110,6 +112,7 @@ class MiniCPM3Attention(nn.Module):
             self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj",
         )
         # O projection.
         self.o_proj = RowParallelLinear(
@@ -117,6 +120,7 @@ class MiniCPM3Attention(nn.Module):
             self.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index bc56481820a92..c1f411b6cd2ac 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -98,7 +98,7 @@ class MixtralMoE(nn.Module):
         self.hidden_size = hidden_size
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
 
         # Expert Parallelism Load balancing settings.
@@ -546,7 +546,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
         )
 
         self.expert_weights = []
-        self.moe_layers: list[FusedMoE] = []
+        self.moe_layers = []
         example_moe = None
 
         for layer in self.model.layers:
@@ -572,22 +572,6 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
         self.num_expert_groups = 1
         self.num_shared_experts = 0
 
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
     def update_physical_experts_metadata(
         self,
         num_physical_experts: int,
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 81be1135dfd9b..4548abde77d5f 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -65,6 +65,7 @@ from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (
+    MixtureOfExperts,
     MultiModalEmbeddings,
     SupportsEagle3,
     SupportsMultiModal,
@@ -723,7 +724,7 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
     dummy_inputs=Mllama4DummyInputsBuilder,
 )
 class Llama4ForConditionalGeneration(
-    nn.Module, SupportsMultiModal, SupportsPP, SupportsEagle3
+    nn.Module, SupportsMultiModal, SupportsPP, MixtureOfExperts, SupportsEagle3
 ):
     merge_by_field_config = True
 
@@ -776,6 +777,17 @@ class Llama4ForConditionalGeneration(
             self.language_model.make_empty_intermediate_tensors
         )
 
+        # Set MoE hyperparameters
+        self.num_expert_groups = 1
+        self.num_logical_experts = self.language_model.num_logical_experts
+        self.num_physical_experts = self.language_model.num_physical_experts
+        self.num_local_physical_experts = self.language_model.num_local_physical_experts
+        self.num_routed_experts = self.language_model.num_routed_experts
+        self.num_shared_experts = self.language_model.num_shared_experts
+        self.num_redundant_experts = self.language_model.num_redundant_experts
+        self.moe_layers = self.language_model.moe_layers
+        self.num_moe_layers = len(self.moe_layers)
+
     def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
         """Set which layers should output auxiliary hidden states for EAGLE3."""
         # Delegate to underlying language model (Llama4ForCausalLM)
@@ -792,6 +804,24 @@ class Llama4ForConditionalGeneration(
         assert hasattr(self.language_model, "get_eagle3_aux_hidden_state_layers")
         return self.language_model.get_eagle3_aux_hidden_state_layers()
 
+    def set_eplb_state(
+        self,
+        expert_load_view: torch.Tensor,
+        logical_to_physical_map: torch.Tensor,
+        logical_replica_count: torch.Tensor,
+    ):
+        self.language_model.set_eplb_state(
+            expert_load_view, logical_to_physical_map, logical_replica_count
+        )
+        self.expert_weights = self.language_model.expert_weights
+
+    def update_physical_experts_metadata(
+        self, num_physical_experts: int, num_local_physical_experts: int
+    ):
+        self.language_model.update_physical_experts_metadata(
+            num_physical_experts, num_local_physical_experts
+        )
+
     def _parse_and_validate_image_input(
         self, **kwargs: object
     ) -> Llama4ImagePatchInputs | None:
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 936dbf6c3243e..29e887c4d9c98 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -83,6 +83,7 @@ class MPTAttention(nn.Module):
             self.total_num_kv_heads,
             bias=not config.no_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.Wqkv",
         )
         if self.qk_ln:
             self.q_ln = nn.LayerNorm(self.d_model)
@@ -92,6 +93,7 @@ class MPTAttention(nn.Module):
             self.d_model,
             bias=not config.no_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
         )
 
         tp_world_size = get_tensor_model_parallel_world_size()
@@ -152,6 +154,7 @@ class MPTMLP(nn.Module):
         self,
         config: MptConfig,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.d_model
@@ -162,6 +165,7 @@ class MPTMLP(nn.Module):
             intermediate_size,
             bias=not config.no_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
         )
         self.act = get_act_fn("gelu")
         self.down_proj = RowParallelLinear(
@@ -169,6 +173,7 @@ class MPTMLP(nn.Module):
             hidden_size,
             bias=not config.no_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -193,7 +198,7 @@ class MPTBlock(nn.Module):
             config, cache_config, quant_config, prefix=f"{prefix}.attn"
         )
         self.norm_2 = nn.LayerNorm(hidden_size)
-        self.ffn = MPTMLP(config, quant_config)
+        self.ffn = MPTMLP(config, quant_config, prefix=f"{prefix}.ffn")
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 457d3910d0e57..fb58d01be7ba1 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -20,6 +20,7 @@
 
 import typing
 from collections.abc import Callable, Iterable
+from itertools import islice
 
 import torch
 from torch import nn
@@ -549,7 +550,7 @@ class NemotronHModel(nn.Module):
         self.start_layer, self.end_layer, self.layers = make_layers(
             len(config.hybrid_override_pattern), get_layer, prefix=f"{prefix}.layers"
         )
-        self.make_empty_intmd_tensors = make_empty_intermediate_tensors_factory(
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
             ["hidden_states", "residual"], config.hidden_size
         )
 
@@ -564,7 +565,7 @@ class NemotronHModel(nn.Module):
         positions: torch.Tensor,
         intermediate_tensors: IntermediateTensors | None = None,
         inputs_embeds: torch.Tensor | None = None,
-    ) -> torch.Tensor:
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -576,8 +577,7 @@ class NemotronHModel(nn.Module):
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        residual = None
-        for i, layer in enumerate(self.layers):
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states, residual = layer(
                 positions=positions,
                 hidden_states=hidden_states,
@@ -633,6 +633,9 @@ class NemotronHModel(nn.Module):
                 if name.endswith(".bias") and name not in params_dict:
                     continue
 
+                if is_pp_missing_parameter(name, self):
+                    continue
+
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
@@ -678,6 +681,9 @@ class NemotronHModel(nn.Module):
                     if is_expert_weight:
                         continue
 
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
                     param = params_dict[name]
                     weight_loader = getattr(
                         param, "weight_loader", default_weight_loader
@@ -792,14 +798,16 @@ class NemotronHForCausalLM(
             self.unpadded_vocab_size, config.vocab_size
         )
 
-        self.make_empty_intmd_tensors = self.model.make_empty_intmd_tensors
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
 
         # Set MoE hyperparameters
         if self.model.has_moe:
             self.expert_weights = []
             self.num_expert_groups = config.n_group
 
-            self.moe_layers: list[SharedFusedMoE] = []
+            self.moe_layers = []
             example_moe = None
             for layer in self.model.layers:
                 if isinstance(layer, NemotronHMoEDecoderLayer):
@@ -816,22 +824,6 @@ class NemotronHForCausalLM(
             self.num_shared_experts = example_moe.n_shared_experts
             self.num_redundant_experts = example_moe.n_redundant_experts
 
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
     def update_physical_experts_metadata(
         self,
         num_physical_experts: int,
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 7f867244330fa..35a09334a1293 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -158,6 +158,7 @@ class OlmoeAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.tp_size = tp_size
         self.tp_rank = get_tensor_model_parallel_rank()
@@ -168,6 +169,7 @@ class OlmoeAttention(nn.Module):
             self.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
new file mode 100644
index 0000000000000..bf1b7570a8828
--- /dev/null
+++ b/vllm/model_executor/models/openpangu.py
@@ -0,0 +1,1062 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import typing
+from collections.abc import Callable, Iterable
+from typing import Any
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.attention import Attention, AttentionType
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, ParallelConfig, VllmConfig
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+    tensor_model_parallel_all_gather,
+)
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.interfaces import (
+    MixtureOfExperts,
+    SupportsLoRA,
+    SupportsPP,
+)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+    sequence_parallel_chunk,
+)
+from vllm.sequence import IntermediateTensors
+
+
+def check_ffn_act_fn(act_fn: str):
+    if act_fn != "silu":
+        raise ValueError(
+            f"Unsupported activation: {act_fn}. Only silu is supported for now."
+        )
+
+
+class OpenPanguMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: QuantizationConfig | None = None,
+        bias: bool = False,
+        reduce_results: bool = True,
+        is_sequence_parallel=False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            disable_tp=is_sequence_parallel,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            disable_tp=is_sequence_parallel,
+            prefix=f"{prefix}.down_proj",
+        )
+
+        check_ffn_act_fn(hidden_act)
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(self.act_fn(self.gate_up_proj(x)[0]))[0]
+
+
+class OpenPanguMoE(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        parallel_config: ParallelConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tp_group().rank_in_group
+
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts: int = config.n_routed_experts
+        self.n_shared_experts: int = config.n_shared_experts
+
+        self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+        check_ffn_act_fn(config.hidden_act)
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+        self.gate.e_score_correction_bias = None
+
+        # Load balancing settings.
+        eplb_config = parallel_config.eplb_config
+        self.enable_eplb = parallel_config.enable_eplb
+
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        if config.n_shared_experts is not None:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            self.shared_experts = OpenPanguMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                is_sequence_parallel=self.is_sequence_parallel,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+        else:
+            self.shared_experts = None
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=1,
+            topk_group=1,
+            prefix=f"{prefix}.experts",
+            scoring_func="sigmoid",
+            # we do scaling outside, set factor to 1.0 to avoid double mul
+            routed_scaling_factor=1.0,
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        if self.is_sequence_parallel:
+            hidden_states = sequence_parallel_chunk(hidden_states)
+
+        router_logits, _ = self.gate(hidden_states)
+
+        fused_moe_out = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+
+        shared_output, final_hidden_states = fused_moe_out
+        if self.shared_experts is None:
+            assert shared_output is None
+
+        if hidden_states.dtype != torch.float16:
+            final_hidden_states *= self.routed_scaling_factor
+        elif self.shared_experts is not None:
+            assert shared_output is not None
+            shared_output *= 1.0 / self.routed_scaling_factor
+
+        if self.shared_experts is not None:
+            assert shared_output is not None
+            final_hidden_states += shared_output
+
+        if self.is_sequence_parallel:
+            final_hidden_states = tensor_model_parallel_all_gather(
+                final_hidden_states, 0
+            )
+            final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class OpenPanguMLAAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int | None,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        max_position_embeddings: int = 8192,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.rope_theta = rope_theta
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        if num_heads % self.tp_size != 0:
+            raise ValueError(
+                f"num_heads {num_heads} is not divisible by tp_size {self.tp_size}."
+            )
+        self.num_local_heads = num_heads // self.tp_size
+
+        self.scaling = self.qk_head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        self.prefix = prefix
+
+        if self.q_lora_rank is not None:
+            self.fused_qkv_a_proj = MergedColumnParallelLinear(
+                self.hidden_size,
+                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.fused_qkv_a_proj",
+                disable_tp=True,
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_b_proj",
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_proj",
+            )
+            self.kv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.kv_a_proj_with_mqa",
+            )
+
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        # TODO: remove hard coding
+        rope_scaling = {
+            "beta_fast": 32,
+            "beta_slow": 1,
+            "factor": 1,
+            "mscale": 1.0,
+            "mscale_all_dim": 1.0,
+            "original_max_position_embeddings": max_position_embeddings,
+            "type": "yarn",
+            "rope_type": "deepseek_yarn",
+        }
+        self.rotary_emb = get_rope(
+            qk_rope_head_dim,
+            rotary_dim=qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=False,
+        )
+
+        mla_modules = MLAModules(
+            kv_a_layernorm=self.kv_a_layernorm,
+            kv_b_proj=self.kv_b_proj,
+            rotary_emb=self.rotary_emb,
+            o_proj=self.o_proj,
+            fused_qkv_a_proj=self.fused_qkv_a_proj
+            if self.q_lora_rank is not None
+            else None,
+            kv_a_proj_with_mqa=self.kv_a_proj_with_mqa
+            if self.q_lora_rank is None
+            else None,
+            q_a_layernorm=self.q_a_layernorm if self.q_lora_rank is not None else None,
+            q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None,
+            q_proj=self.q_proj if self.q_lora_rank is None else None,
+            indexer=None,
+            is_sparse=False,
+            topk_indices_buffer=None,
+        )
+
+        self.mla_attn = MultiHeadLatentAttentionWrapper(
+            self.hidden_size,
+            self.num_local_heads,
+            self.scaling,
+            self.qk_nope_head_dim,
+            self.qk_rope_head_dim,
+            self.v_head_dim,
+            self.q_lora_rank,
+            self.kv_lora_rank,
+            mla_modules,
+            cache_config,
+            quant_config,
+            prefix,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.mla_attn(positions, hidden_states)
+
+
+class OpenPanguEmbeddedAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: dict[str, Any] | None = None,
+        max_position_embeddings: int = 8192,
+        quant_config: QuantizationConfig | None = None,
+        bias: bool = False,
+        bias_o_proj: bool = False,
+        cache_config: CacheConfig | None = None,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+    ) -> None:
+        super().__init__()
+        layer_idx = extract_layer_index(prefix)
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        if self.total_num_heads % tp_size != 0:
+            raise ValueError(
+                f"total_num_heads {self.total_num_heads} "
+                f"is not divisible by tp_size {tp_size}."
+            )
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads > tp_size and self.total_num_kv_heads % tp_size != 0:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel ranks.
+            raise ValueError(
+                "Number of KV heads is greater than TP size, "
+                f"but total_num_kv_heads {self.total_num_kv_heads} "
+                f"is not divisible by tp_size {tp_size}."
+            )
+        elif (
+            self.total_num_kv_heads < tp_size and tp_size % self.total_num_kv_heads != 0
+        ):
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel ranks.
+            raise ValueError(
+                f"Number of KV heads is less than TP size, but tp_size {tp_size} "
+                f"is not divisible by total_num_kv_heads {self.total_num_kv_heads}."
+            )
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        head_dim = getattr(config, "head_dim", None)
+        if head_dim is None:
+            head_dim = self.hidden_size // self.total_num_heads
+        self.head_dim = head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=bias_o_proj,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self._init_rotary_emb(
+            config, rope_scaling=rope_scaling, quant_config=quant_config
+        )
+
+        if hasattr(config, "interleaved_sliding_window"):
+            interleaved_sliding_window = config.interleaved_sliding_window
+            if isinstance(interleaved_sliding_window, int):
+                sliding_window = interleaved_sliding_window
+            elif isinstance(interleaved_sliding_window, list):
+                sw_idx = layer_idx % len(interleaved_sliding_window)
+                sliding_window = interleaved_sliding_window[sw_idx]
+            else:
+                raise ValueError(
+                    f"{type(interleaved_sliding_window)} "
+                    "for interleaved_sliding_window is not supported."
+                )
+        else:
+            sliding_window = None
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            per_layer_sliding_window=sliding_window,
+            attn_type=attn_type,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def _init_rotary_emb(
+        self,
+        config: PretrainedConfig,
+        rope_scaling: dict[str, Any] | None,
+        quant_config: QuantizationConfig | None,
+    ) -> None:
+        is_neox_style = True
+        is_gguf = quant_config and quant_config.get_name() == "gguf"
+        if is_gguf and config.model_type == "PanguEmbedded":
+            is_neox_style = False
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
+        )
+
+
+class OpenPanguDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        vllm_config: VllmConfig,
+    ) -> None:
+        super().__init__()
+
+        if config is None:
+            config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        parallel_config = vllm_config.parallel_config
+
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+
+        layer_idx = int(prefix.split(sep=".")[-1])
+        self.layer_idx = layer_idx
+
+        self.use_mla = (
+            hasattr(config, "qk_nope_head_dim")
+            and hasattr(config, "qk_rope_head_dim")
+            and hasattr(config, "v_head_dim")
+            and hasattr(config, "kv_lora_rank")
+        )
+        if self.use_mla:
+            self.self_attn = OpenPanguMLAAttention(
+                config=config,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                qk_nope_head_dim=config.qk_nope_head_dim,
+                qk_rope_head_dim=config.qk_rope_head_dim,
+                v_head_dim=config.v_head_dim,
+                q_lora_rank=(
+                    config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+                ),
+                kv_lora_rank=config.kv_lora_rank,
+                rope_theta=rope_theta,
+                max_position_embeddings=max_position_embeddings,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.self_attn",
+            )
+        else:
+            attention_bias = getattr(config, "attention_bias", False) or getattr(
+                config, "bias", False
+            )
+            bias_o_proj = attention_bias
+            if hasattr(config, "qkv_bias"):
+                attention_bias = config.qkv_bias
+            # By default, PanguEmbedded uses causal attention
+            # as it is a decoder-only model.
+            # You can override the HF config with `is_causal=False` to enable
+            # bidirectional attention, which is used in some embedding models
+            if getattr(config, "is_causal", True):
+                attn_type = AttentionType.DECODER
+            else:
+                attn_type = AttentionType.ENCODER_ONLY
+            self.self_attn = OpenPanguEmbeddedAttention(
+                config=config,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                num_kv_heads=getattr(
+                    config, "num_key_value_heads", config.num_attention_heads
+                ),
+                rope_theta=rope_theta,
+                rope_scaling=getattr(config, "rope_scaling", None),
+                max_position_embeddings=max_position_embeddings,
+                quant_config=quant_config,
+                bias=attention_bias,
+                bias_o_proj=bias_o_proj,
+                cache_config=cache_config,
+                prefix=f"{prefix}.self_attn",
+                attn_type=attn_type,
+            )
+
+        if (
+            getattr(config, "n_routed_experts", None) is not None
+            and layer_idx >= config.first_k_dense_replace
+        ):
+            self.mlp = OpenPanguMoE(
+                config=config,
+                parallel_config=parallel_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        else:
+            self.mlp = OpenPanguMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                bias=getattr(config, "mlp_bias", False),
+                prefix=f"{prefix}.mlp",
+            )
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", None)
+        self.num_hidden_layers = config.num_hidden_layers
+        self.first_k_dense_replace = getattr(
+            config, "first_k_dense_replace", self.num_hidden_layers
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.tp_group = get_tp_group().device_group
+        self.sandwich_norm = getattr(config, "sandwich_norm", False)
+        if self.sandwich_norm:
+            self.pre_mlp_layernorm = RMSNorm(
+                config.hidden_size, eps=config.rms_norm_eps
+            )
+            self.post_mlp_layernorm = RMSNorm(
+                config.hidden_size, eps=config.rms_norm_eps
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> torch.Tensor:
+        if residual is None:
+            residual = hidden_states.clone()
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        if (
+            self.routed_scaling_factor is not None
+            and hidden_states.dtype == torch.float16
+        ):
+            # Fix FP16 overflow
+            # We scale both hidden_states and residual before
+            # rmsnorm, and rmsnorm result would not affect by scale.
+            hidden_states *= 1.0 / self.routed_scaling_factor
+            if self.layer_idx == 0:
+                # The residual is shared by all layers, we only scale it on
+                # first layer.
+                residual *= 1.0 / self.routed_scaling_factor
+
+        if self.sandwich_norm:
+            hidden_states = self.post_attention_layernorm(hidden_states)
+            hidden_states, residual = self.pre_mlp_layernorm(hidden_states, residual)
+        else:
+            hidden_states, residual = self.post_attention_layernorm(
+                hidden_states, residual
+            )
+
+        # Fully Connected
+        hidden_states = self.mlp(hidden_states)
+
+        if (
+            self.routed_scaling_factor is not None
+            and isinstance(self.mlp, OpenPanguMLP)
+            and hidden_states.dtype == torch.float16
+        ):
+            hidden_states *= 1.0 / self.routed_scaling_factor
+
+        if self.sandwich_norm:
+            hidden_states = self.post_mlp_layernorm(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class OpenPanguModel(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.config = config
+        self.num_redundant_experts = eplb_config.num_redundant_experts
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank or (
+            config.tie_word_embeddings and get_pp_group().is_last_rank
+        ):
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: OpenPanguDecoderLayer(config, prefix, vllm_config),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def load_attn_mlp_weight(
+        self,
+        attn_mlp_replace_mapping: list[tuple[str, str, int]],
+        params_dict: dict[str, Any],
+        weight_name: str,
+        loaded_weight: torch.Tensor,
+        loaded_params: set[str],
+    ) -> bool:
+        for param_name, origin_name, shard_id in attn_mlp_replace_mapping:
+            if origin_name not in weight_name or (
+                ("mlp.experts." in weight_name) and weight_name not in params_dict
+            ):
+                continue
+            weight_name_mapped = weight_name.replace(origin_name, param_name)
+            if (
+                param_name == "fused_qkv_a_proj"
+                and weight_name_mapped not in params_dict
+            ):
+                continue
+            else:
+                weight_name = weight_name_mapped
+            if weight_name.endswith(".bias") and weight_name not in params_dict:
+                continue
+            if is_pp_missing_parameter(weight_name, self):
+                continue
+
+            param = params_dict[weight_name]
+            weight_loader = param.weight_loader
+            weight_loader(param, loaded_weight, shard_id)
+            loaded_params.add(weight_name)
+            return True
+        return False
+
+    def load_expert_weight(
+        self,
+        expert_merge_mapping: list[tuple[str, str, int, str]],
+        params_dict: dict[str, Any],
+        weight_name: str,
+        loaded_weight: torch.Tensor,
+        loaded_params: set[str],
+        flag_dict: dict[str, bool],
+    ) -> bool:
+        for mapping in expert_merge_mapping:
+            param_name, origin_name, expert_id, shard_id = mapping
+            if origin_name not in weight_name:
+                continue
+            flag_dict["is_expert_weight"] = True
+            weight_name_mapped = weight_name.replace(origin_name, param_name)
+            if is_pp_missing_parameter(weight_name_mapped, self):
+                continue
+            param = params_dict[weight_name_mapped]
+            weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
+            success = weight_loader(
+                param,
+                loaded_weight,
+                weight_name_mapped,
+                shard_id=shard_id,
+                expert_id=expert_id,
+                return_success=True,
+            )
+            if success:
+                weight_name = weight_name_mapped
+                loaded_params.add(weight_name_mapped)
+                return True
+        return False
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        attn_mlp_replace_mapping = [
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".fused_qkv_a_proj", ".q_a_proj", 0),
+            (".fused_qkv_a_proj", ".kv_a_proj_with_mqa", 1),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        has_experts = hasattr(self.config, "n_routed_experts")
+        if has_experts:
+            expert_merge_mapping = SharedFusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=self.config.n_routed_experts,
+                num_redundant_experts=self.num_redundant_experts,
+            )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            if (
+                "layers" in name
+                and hasattr(self.config, "num_nextn_predict_layers")
+                and (self.config.num_nextn_predict_layers > 0)
+            ):
+                layer_idx = int(name.split("layers.")[-1].split(".")[0])
+                mtp_idx = layer_idx - self.config.num_hidden_layers
+                if mtp_idx >= 0 and mtp_idx < self.config.num_nextn_predict_layers:
+                    continue  # skip spec decode layers for main model
+
+            flag_dict = {"is_expert_weight": False}
+            if (
+                self.load_attn_mlp_weight(
+                    attn_mlp_replace_mapping,
+                    params_dict,
+                    name,
+                    loaded_weight,
+                    loaded_params,
+                )
+                or has_experts
+                and self.load_expert_weight(
+                    expert_merge_mapping,
+                    params_dict,
+                    name,
+                    loaded_weight,
+                    loaded_params,
+                    flag_dict,
+                )
+            ):
+                continue
+            else:
+                if flag_dict["is_expert_weight"]:
+                    continue
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+        return loaded_params
+
+
+class OpenPanguModelBase(nn.Module, SupportsPP, SupportsLoRA):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+
+        self.fuse_qkv_a_proj = (
+            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
+        )
+        if self.fuse_qkv_a_proj:
+            self.packed_modules_mapping["fused_qkv_a_proj"] = [
+                "q_a_proj",
+                "kv_a_proj_with_mqa",
+            ]
+
+        self.model = OpenPanguModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+            if config.tie_word_embeddings:
+                self.lm_head.weight = self.model.embed_tokens.weight
+        else:
+            self.lm_head = PPMissingLayer()
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
+
+
+class OpenPanguMoEModel(OpenPanguModelBase, MixtureOfExperts):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        config = vllm_config.model_config.hf_config
+
+        # Set MoE hyperparameters
+        self.expert_weights = []
+        self.num_moe_layers = config.num_hidden_layers - config.first_k_dense_replace
+        self.num_expert_groups = 1
+
+        self.moe_layers = []
+        example_moe = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
+            assert isinstance(layer, OpenPanguDecoderLayer)
+            if isinstance(layer.mlp, OpenPanguMoE):
+                # Pick last one layer since the first ones may be dense layers.
+                example_moe = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+        if example_moe is None:
+            raise RuntimeError("No MOE layer found in model.layers.")
+
+        self.num_logical_experts = example_moe.n_logical_experts
+        self.num_physical_experts = example_moe.n_physical_experts
+        self.num_local_physical_experts = example_moe.n_local_physical_experts
+        self.n_routed_experts = example_moe.n_routed_experts
+        self.n_shared_experts = example_moe.n_shared_experts
+        self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.model.layers:
+            if isinstance(layer.mlp, OpenPanguMoE):
+                moe = layer.mlp
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
+
+class OpenPanguEmbeddedModel(OpenPanguModelBase):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+
+class PanguEmbeddedForCausalLM(OpenPanguEmbeddedModel):
+    pass
+
+
+class PanguUltraMoEForCausalLM(OpenPanguMoEModel):
+    pass
diff --git a/vllm/model_executor/models/openpangu_mtp.py b/vllm/model_executor/models/openpangu_mtp.py
new file mode 100644
index 0000000000000..f4049f2d39705
--- /dev/null
+++ b/vllm/model_executor/models/openpangu_mtp.py
@@ -0,0 +1,265 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/model_executor/models/deepseek_mtp.py
+from collections.abc import Iterable
+
+import torch
+import torch.nn as nn
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.deepseek_mtp import (
+    DeepSeekMultiTokenPredictor,
+    DeepSeekMultiTokenPredictorLayer,
+    SharedHead,
+)
+from vllm.model_executor.models.utils import maybe_prefix
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import SupportsPP
+from .openpangu import OpenPanguDecoderLayer
+
+
+class OpenPanguMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):
+    def __init__(self, vllm_config: VllmConfig, prefix: str) -> None:
+        nn.Module.__init__(self)
+
+        config = vllm_config.speculative_config.draft_model_config.hf_config
+        self.config = config
+        quant_config = vllm_config.quant_config
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
+        self.shared_head = SharedHead(
+            config=config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "shared_head"),
+        )
+        self.mtp_block = OpenPanguDecoderLayer(config, prefix, vllm_config)
+
+
+class OpenPanguMultiTokenPredictor(DeepSeekMultiTokenPredictor):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        nn.Module.__init__(self)
+        config = vllm_config.model_config.hf_config
+        self.mtp_start_layer_idx = config.num_hidden_layers
+        self.num_mtp_layers = config.num_nextn_predict_layers
+        # to map the exact layer index from weights
+        self.layers = torch.nn.ModuleDict(
+            {
+                str(idx): OpenPanguMultiTokenPredictorLayer(
+                    vllm_config, f"{prefix}.layers.{idx}"
+                )
+                for idx in range(
+                    self.mtp_start_layer_idx,
+                    self.mtp_start_layer_idx + self.num_mtp_layers,
+                )
+            }
+        )
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+
+@support_torch_compile
+class OpenPanguMTP(nn.Module, SupportsPP):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        self.config = vllm_config.model_config.hf_config
+        self.model = OpenPanguMultiTokenPredictor(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            hidden_states,
+            inputs_embeds,
+            spec_step_idx,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor | None:
+        return self.model.compute_logits(hidden_states, spec_step_idx)
+
+    def get_spec_layer(self, name):
+        if (
+            "layers" in name
+            and hasattr(self.config, "num_nextn_predict_layers")
+            and self.config.num_nextn_predict_layers > 0
+        ):
+            layer_idx = int(name.split("layers.")[-1].split(".")[0])
+            mtp_idx = layer_idx - self.config.num_hidden_layers
+            if mtp_idx >= 0 and mtp_idx < self.config.num_nextn_predict_layers:
+                return layer_idx
+        return None
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+            ("fused_qkv_a_proj", "q_a_proj", 0),
+            ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            spec_layer = self.get_spec_layer(name)
+            if spec_layer is None:
+                continue
+
+            name = self._rewrite_spec_layer_name(spec_layer, name)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name_mapped = name.replace(weight_name, param_name)
+
+                # QKV fusion is optional, fall back to normal
+                # weight loading if it's not enabled
+                if (
+                    param_name == "fused_qkv_a_proj"
+                ) and name_mapped not in params_dict:
+                    continue
+                else:
+                    name = name_mapped
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    if (
+                        spec_layer != self.model.mtp_start_layer_idx
+                        and ".layers" not in name
+                    ):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+    def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str:
+        """
+        Rewrite the weight name to match the format of the original model.
+        Add .mtp_block for modules in transformer layer block for spec layer
+        and rename shared layer weights to be top level.
+        """
+        spec_layer_weight_names = [
+            "embed_tokens",
+            "enorm",
+            "hnorm",
+            "eh_proj",
+            "shared_head",
+        ]
+        shared_weight_names = ["embed_tokens"]
+        spec_layer_weight = False
+        shared_weight = False
+        for weight_name in spec_layer_weight_names:
+            if weight_name in name:
+                spec_layer_weight = True
+                if weight_name in shared_weight_names:
+                    shared_weight = True
+                break
+        if not spec_layer_weight:
+            # treat rest weights as weights for transformer layer block
+            name = name.replace(
+                f"model.layers.{spec_layer}.", f"model.layers.{spec_layer}.mtp_block."
+            )
+        elif shared_weight:
+            # treat shared weights as top level weights
+            name = name.replace(f"model.layers.{spec_layer}.", "model.")
+        return name
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index cfe4d03334182..cbfce18b43885 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -52,13 +52,22 @@ class OrionMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
-            intermediate_size, hidden_size, bias=False, quant_config=quant_config
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -116,12 +125,14 @@ class OrionAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
@@ -183,6 +194,7 @@ class OrionDecoderLayer(nn.Module):
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
         )
 
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
new file mode 100644
index 0000000000000..631475c964c0b
--- /dev/null
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -0,0 +1,1404 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from functools import partial
+from typing import Annotated, Literal
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from transformers import BatchFeature, PretrainedConfig
+from transformers.activations import GELUActivation
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPooling,
+)
+from transformers.utils import torch_int
+
+from vllm.attention.backends.registry import _Backend
+from vllm.attention.layer import (
+    check_upstream_fa_availability,
+    maybe_get_vit_flash_attn_backend,
+)
+from vllm.attention.ops.vit_attn_wrappers import (
+    vit_flash_attn_wrapper,
+    vit_xformers_attn_wrapper,
+)
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.distributed import parallel_state
+from vllm.distributed import utils as dist_utils
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding.common import (
+    dispatch_rotary_emb_function,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargs,
+)
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    ImageSize,
+    MultiModalDataItems,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+from .ernie45 import Ernie4_5ForCausalLM
+from .interfaces import MultiModalEmbeddings, SupportsMRoPE, SupportsMultiModal
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    WeightsMapper,
+    is_pp_missing_parameter,
+    maybe_prefix,
+)
+from .vision import get_vit_attn_backend
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 28,
+    min_pixels: int = 28 * 28 * 130,
+    max_pixels: int = 28 * 28 * 1280,
+):
+    """Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
+
+    """
+
+    if height < factor:
+        width = round((width * factor) / height)
+        height = factor
+
+    if width < factor:
+        height = round((height * factor) / width)
+        width = factor
+
+    if max(height, width) / min(height, width) > 200:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than 200, "
+            f"got {max(height, width) / min(height, width)}"
+        )
+    h_bar = round(height / factor) * factor
+    w_bar = round(width / factor) * factor
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = math.floor(height / beta / factor) * factor
+        w_bar = math.floor(width / beta / factor) * factor
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = math.ceil(height * beta / factor) * factor
+        w_bar = math.ceil(width * beta / factor) * factor
+    return h_bar, w_bar
+
+
+def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
+    if not interleaved:
+        x1, x2 = x.chunk(2, dim=-1)
+        return torch.cat((-x2, x1), dim=-1)
+    x1, x2 = x[..., ::2], x[..., 1::2]
+    return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2)
+
+
+def apply_rotary_emb_torch(
+    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False
+) -> torch.Tensor:
+    """
+    x: (batch_size, seqlen, nheads, headdim)
+    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+    """
+    ro_dim = cos.shape[-1] * 2
+    assert ro_dim <= x.shape[-1]
+    cos = repeat(
+        cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
+    sin = repeat(
+        sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
+    return torch.cat(
+        [
+            x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
+            x[..., ro_dim:],
+        ],
+        dim=-1,
+    )
+
+
+def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    rotary_emb_function = dispatch_rotary_emb_function(default=apply_rotary_emb_torch)
+    t_ = t.float()
+    cos = freqs.cos()
+    sin = freqs.sin()
+    output = rotary_emb_function(t_, cos, sin).type_as(t)
+    return output
+
+
+class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_hf_processor(self, **kwargs: object):
+        return self.ctx.get_hf_processor(**kwargs)
+
+    def get_image_processor(self, **kwargs: object):
+        return self.get_hf_processor(**kwargs).image_processor
+
+    def get_supported_mm_limits(self):
+        return {"image": None}
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        image_processor,
+    ) -> int:
+        if image_processor is None:
+            image_processor = self.get_image_processor()
+
+        do_resize = True
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        patch_size = vision_config.patch_size
+        merge_size = vision_config.spatial_merge_size
+
+        if do_resize:
+            resized_height, resized_width = smart_resize(
+                height=image_height,
+                width=image_width,
+                factor=patch_size * merge_size,
+                min_pixels=image_processor.min_pixels,
+                max_pixels=image_processor.max_pixels,
+            )
+            preprocessed_size = ImageSize(width=resized_width, height=resized_height)
+        else:
+            preprocessed_size = ImageSize(width=image_width, height=image_height)
+
+        grid_t = 1
+        grid_h = preprocessed_size.height // patch_size
+        grid_w = preprocessed_size.width // patch_size
+
+        num_patches = grid_t * grid_h * grid_w
+        num_image_tokens = num_patches // (merge_size**2)
+
+        return num_image_tokens
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        hf_config = self.get_hf_config()
+        image_size = hf_config.vision_config.image_size
+        return ImageSize(height=image_size, width=image_size)
+
+
+class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        max_image_size = self.info.get_image_size_with_most_features()
+        image_overrides = mm_options.get("image") if mm_options else None
+
+        return {
+            "image": self._get_dummy_images(
+                width=max_image_size.width,
+                height=max_image_size.height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
+        }
+
+
+class PaddleOCRVLMultiModalProcessor(
+    BaseMultiModalProcessor[PaddleOCRVLProcessingInfo]
+):
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if mm_data:
+            processed_outputs = self.info.ctx.call_hf_processor(
+                self.info.get_hf_processor(**mm_kwargs),
+                dict(text=prompt, **mm_data),
+                dict(**mm_kwargs, **tok_kwargs),
+            )
+            num_patches_per_image = processed_outputs["image_grid_thw"].prod(-1)
+            processed_outputs["pixel_values"] = processed_outputs["pixel_values"].split(
+                num_patches_per_image.tolist()
+            )
+        else:
+            tokenizer = self.info.get_tokenizer()
+            processed_outputs = tokenizer(
+                prompt, add_special_tokens=True, return_tensors="pt"
+            )
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_grid_thw=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
+        hf_config = self.info.get_hf_config()
+        image_token_id = hf_config.image_token_id
+
+        def get_replacement(item_idx: int, image_processor):
+            images = mm_items.get_items("image", ImageProcessorItems)
+
+            image_size = images.get_image_size(item_idx)
+            num_image_tokens = self.info.get_num_image_tokens(
+                image_width=image_size.width,
+                image_height=image_size.height,
+                image_processor=image_processor,
+            )
+
+            return [image_token_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=partial(get_replacement, image_processor=image_processor),
+            ),
+        ]
+
+
+class Projector(nn.Module):
+    def __init__(
+        self,
+        text_config: PretrainedConfig,
+        vision_config: PretrainedConfig,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.merge_kernel_size = (2, 2)
+
+        self.hidden_size = (
+            self.vision_config.hidden_size
+            * self.merge_kernel_size[0]
+            * self.merge_kernel_size[1]
+        )
+
+        self.pre_norm = torch.nn.LayerNorm(self.vision_config.hidden_size, eps=1e-05)
+        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.act = GELUActivation()
+        self.linear_2 = nn.Linear(
+            self.hidden_size, self.text_config.hidden_size, bias=True
+        )
+
+    def forward(
+        self,
+        image_features: torch.Tensor,
+        image_grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        m1, m2 = self.merge_kernel_size
+        if isinstance(image_features, (list, tuple)):
+            processed_features = list()
+            for image_feature, image_grid in zip(image_features, image_grid_thw):
+                image_feature = self.pre_norm(image_feature)
+                t, h, w = image_grid
+
+                image_feature = rearrange(
+                    image_feature,
+                    "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
+                    t=t,
+                    h=h // m1,
+                    p1=m1,
+                    w=w // m2,
+                    p2=m2,
+                )
+                hidden_states = self.linear_1(image_feature)
+                hidden_states = self.act(hidden_states)
+                hidden_states = self.linear_2(hidden_states)
+                processed_features.append(hidden_states)
+
+            return processed_features
+
+        dims = image_features.shape[:-1]
+        dim = image_features.shape[-1]
+        image_features = image_features.view(np.prod(dims), dim)
+        hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+
+        return hidden_states.view(*dims, -1)
+
+
+class PaddleOCRImagePixelInputs(TensorSchema):
+    type: Literal["pixel_values"]
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("bn", "p", 3, "patch_size", "patch_size", dynamic_dims={"p"}),
+    ]
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("bn", 3),
+    ]
+
+
+class SiglipVisionEmbeddings(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.cache_position_embedding = dict()
+        self.cache_position_count = dict()
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.packing_position_embedding = nn.Embedding(32768, self.embed_dim)
+
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
+
+    def interpolate_pos_encoding(
+        self,
+        embeddings: torch.Tensor,
+        height: int,
+        width: int,
+        is_after_patchify: bool = False,
+    ) -> torch.Tensor:
+        num_positions = self.position_embedding.weight.shape[0]
+
+        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
+
+        dim = embeddings.shape[-1]
+
+        if is_after_patchify:
+            new_height = height
+            new_width = width
+        else:
+            new_height = height // self.patch_size
+            new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(
+            1, sqrt_num_positions, sqrt_num_positions, dim
+        )
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bilinear",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return patch_pos_embed
+
+    def fetch_position_embedding_lfu_cache(
+        self, embeddings: torch.Tensor, h: int, w: int, max_cache: int = 20
+    ):
+        grid = (h, w)
+        if grid in self.cache_position_embedding:
+            self.cache_position_count[grid] += 1
+            return self.cache_position_embedding[grid]
+
+        if len(self.cache_position_embedding) >= max_cache:
+            min_hit_grid = min(
+                self.cache_position_count,
+                key=self.cache_position_count.get,
+            )
+            self.cache_position_count.pop(min_hit_grid)
+            self.cache_position_embedding.pop(min_hit_grid)
+
+        position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True)
+        self.cache_position_count[grid] = 1
+        self.cache_position_embedding[grid] = position_embedding
+        return position_embedding
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        position_ids: torch.Tensor | None = None,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        interpolate_pos_encoding=False,
+    ) -> torch.Tensor:
+        if pixel_values.dim() == 4:
+            pixel_values = pixel_values.unsqueeze(0)
+        if pixel_values.dim() == 5:
+            if position_ids is None:
+                raise ValueError(
+                    "position_ids cannot be None when pixel_values.dim() is 5."
+                )
+            (
+                batch_size,
+                squence_len,
+                channel,
+                height,
+                width,
+            ) = pixel_values.shape
+            target_dtype = self.patch_embedding.weight.dtype
+            pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w")
+            patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
+            embeddings = patch_embeds.flatten(-2).squeeze(-1)
+
+            if interpolate_pos_encoding and image_grid_thw is not None:
+                start = 0
+                tmp_embeddings = list()
+                for image_grid in image_grid_thw:
+                    t, h, w = image_grid
+                    end = start + t * h * w
+                    image_embeddings = embeddings[start:end, :]
+                    position_embedding = (
+                        self.interpolate_pos_encoding(image_embeddings, h, w, True)
+                        .squeeze(0)
+                        .repeat(t, 1)
+                    )
+                    image_embeddings = image_embeddings + position_embedding
+                    tmp_embeddings.append(image_embeddings)
+                    start = end
+                embeddings = torch.concat(tmp_embeddings, dim=0).unsqueeze(0)
+            else:
+                embeddings = embeddings + self.packing_position_embedding(position_ids)
+            return embeddings
+        else:
+            raise ValueError(
+                "Unsupported pixel_values dimension:"
+                f" {pixel_values.dim()}. Expected 4 or 5."
+            )
+
+
+def all_gather_interleave(local_tensor: torch.Tensor, hidden_size: int, tp_size: int):
+    """All-gather the input tensor interleavely across model parallel group."""
+    import torch.distributed as dist
+
+    gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)]
+    dist.all_gather(
+        gathered_tensors, local_tensor, group=parallel_state.get_tp_group().device_group
+    )
+
+    gathered_tensors_split = [
+        torch.split(tensor, hidden_size // tp_size, -1) for tensor in gathered_tensors
+    ]
+    ordered_tensors = [
+        tensor for pair in zip(*gathered_tensors_split) for tensor in pair
+    ]
+    result_tensor = torch.cat(ordered_tensors, dim=-1)
+    return result_tensor
+
+
+class SiglipAttention(nn.Module):
+    """SigLIP vision attention adapted from Qwen2.5-VisionAttention."""
+
+    def __init__(
+        self,
+        *,
+        embed_dim: int,
+        num_heads: int,
+        projection_size: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_backend: _Backend = _Backend.TORCH_SDPA,
+        attn_backend_override: _Backend | None = None,
+        use_upstream_fa: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.tp_size = parallel_state.get_tensor_model_parallel_world_size()
+        self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads
+        )
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_heads, self.tp_size
+        )
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=embed_dim,
+            head_size=self.hidden_size_per_attention_head,
+            total_num_heads=num_heads,
+            total_num_kv_heads=num_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.out_proj = RowParallelLinear(
+            input_size=projection_size,
+            output_size=embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+
+        self.attn_backend = attn_backend
+        self.use_upstream_fa = use_upstream_fa
+        self.attn_backend, self.flash_attn_varlen_func = (
+            maybe_get_vit_flash_attn_backend(
+                self.attn_backend,
+                self.use_upstream_fa,
+                attn_backend_override=attn_backend_override,
+            )
+        )
+        self.is_flash_attn_backend = self.attn_backend in {
+            _Backend.FLASH_ATTN,
+            _Backend.ROCM_AITER_FA,
+        }
+
+    def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
+        seq_len, bs, _ = qkv.shape
+        if self.tp_size > 1:
+            qkv = all_gather_interleave(qkv, self.qkv_proj.hidden_size, self.tp_size)
+
+        q, k, v = qkv.chunk(3, dim=2)
+
+        if self.tp_size > 1:
+            splitter = partial(
+                dist_utils.split_tensor_along_last_dim, num_partitions=self.tp_size
+            )
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+            v = splitter(v)[self.tp_rank]
+
+        new_shape = (
+            seq_len,
+            bs,
+            self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        )
+        q, k, v = (x.view(*new_shape) for x in (q, k, v))
+        return q, k, v
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        *,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor | None,
+        max_seqlen: torch.Tensor | None,
+        seqlens: torch.Tensor | None,
+    ) -> torch.Tensor:
+        batch_size, _, _ = hidden_states.shape
+
+        x = rearrange(hidden_states, "b s d -> s b d")
+        x, _ = self.qkv_proj(x)
+        q, k, v = self.split_qkv(x)
+        q, k, v = (rearrange(t, "s b h d -> b s h d") for t in (q, k, v))
+
+        if rotary_pos_emb is not None:
+            qk_concat = torch.cat([q, k], dim=0)
+            qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb)
+            q, k = torch.chunk(qk_rotated, 2, dim=0)
+
+        if self.is_flash_attn_backend:
+            if max_seqlen is None:
+                raise ValueError("Flash attention backend requires max_seqlen.")
+            context_layer = vit_flash_attn_wrapper(
+                q,
+                k,
+                v,
+                cu_seqlens,
+                max_seqlen,
+                batch_size,
+                self.attn_backend == _Backend.ROCM_AITER_FA,
+                self.use_upstream_fa,
+            )
+        elif self.attn_backend == _Backend.TORCH_SDPA:
+            outputs = []
+            for i in range(1, len(cu_seqlens)):
+                start_idx = cu_seqlens[i - 1]
+                end_idx = cu_seqlens[i]
+                q_i = q[:, start_idx:end_idx]
+                k_i = k[:, start_idx:end_idx]
+                v_i = v[:, start_idx:end_idx]
+                q_i, k_i, v_i = (
+                    rearrange(tensor, "b s h d -> b h s d")
+                    for tensor in (q_i, k_i, v_i)
+                )
+                output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
+                output_i = rearrange(output_i, "b h s d -> b s h d")
+                outputs.append(output_i)
+            context_layer = torch.cat(outputs, dim=1)
+            context_layer = rearrange(
+                context_layer, "b s h d -> s b (h d)"
+            ).contiguous()
+        elif self.attn_backend == _Backend.XFORMERS:
+            if seqlens is None:
+                raise ValueError("xFormers attention backend requires seqlens tensor.")
+            context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens)
+        else:
+            raise RuntimeError(
+                f"PaddleOCR-VL does not support {self.attn_backend} backend now."
+            )
+
+        output, _ = self.out_proj(context_layer)
+        output = rearrange(output, "s b d -> b s d")
+        return output
+
+
+class SigLIPRotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        self.rope_init()
+
+    def rope_init(self):
+        inv_freq = 1.0 / (
+            self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(
+            seqlen,
+            device=self.inv_freq.device,
+            dtype=self.inv_freq.dtype,
+        )
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
+class SiglipMLP(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.activation_fn = get_act_fn(config.hidden_act)
+        # Special handling for BNB and torchao quantization
+        if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]:
+            quantizable = True
+        else:
+            # For other quantization, we require the hidden size to be a
+            # multiple of 64
+            quantizable = (
+                config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0
+            )
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            quant_config=quant_config if quantizable else None,
+            prefix=f"{prefix}.fc1",
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            quant_config=quant_config if quantizable else None,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class SiglipEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        *,
+        attn_backend: _Backend = _Backend.TORCH_SDPA,
+        attn_backend_override: _Backend | None = None,
+        use_upstream_fa: bool = False,
+    ):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.self_attn = SiglipAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            projection_size=config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            attn_backend=attn_backend,
+            attn_backend_override=attn_backend_override,
+            use_upstream_fa=use_upstream_fa,
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = SiglipMLP(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        *,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor | None,
+        max_seqlen: torch.Tensor | None,
+        seqlens: torch.Tensor | None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            max_seqlen=max_seqlen,
+            seqlens=seqlens,
+        )
+
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class SiglipEncoder(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_backend_override: _Backend | None = None,
+    ):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+        num_heads = config.num_attention_heads
+        head_dim = embed_dim // num_heads
+        self.attn_backend = get_vit_attn_backend(
+            head_size=head_dim,
+            dtype=torch.get_default_dtype(),
+            attn_backend_override=attn_backend_override,
+        )
+        self.use_upstream_fa = False
+        if self.attn_backend not in {
+            _Backend.FLASH_ATTN,
+            _Backend.ROCM_AITER_FA,
+        } and check_upstream_fa_availability(torch.get_default_dtype()):
+            self.attn_backend = _Backend.FLASH_ATTN
+            self.use_upstream_fa = True
+        if self.attn_backend not in {
+            _Backend.FLASH_ATTN,
+            _Backend.TORCH_SDPA,
+            _Backend.XFORMERS,
+            _Backend.ROCM_AITER_FA,
+        }:
+            raise RuntimeError(
+                f"PaddleOCR-VL does not support {self.attn_backend} backend now."
+            )
+        self.layers = nn.ModuleList(
+            [
+                SiglipEncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    attn_backend=self.attn_backend,
+                    attn_backend_override=attn_backend_override,
+                    use_upstream_fa=self.use_upstream_fa,
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
+
+    @staticmethod
+    def flatten_list(image_grid_thw):
+        tmp_image_grid_thw = list()
+        for image_grid in image_grid_thw:
+            if isinstance(image_grid, list):
+                tmp_image_grid_thw.extend(image_grid)
+            else:
+                tmp_image_grid_thw.append(image_grid)
+        return tmp_image_grid_thw
+
+    def forward(
+        self,
+        inputs_embeds,
+        cu_seqlens: torch.Tensor | None = None,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        height_position_ids: torch.Tensor | None = None,
+        width_position_ids: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        device = inputs_embeds.device
+        hidden_states = inputs_embeds
+
+        flatten_image_grid_thw = self.flatten_list(image_grid_thw)
+
+        if width_position_ids is None or height_position_ids is None:
+            split_hids = list()
+            split_wids = list()
+            for t, h, w in flatten_image_grid_thw:
+                image_pids = torch.arange(t * h * w, device=device) % (h * w)
+                sample_hids = image_pids // w
+                sample_wids = image_pids % w
+                split_hids.append(sample_hids)
+                split_wids.append(sample_wids)
+            width_position_ids = torch.concat(split_wids, dim=0)
+            height_position_ids = torch.concat(split_hids, dim=0)
+
+        pids = torch.stack(
+            [height_position_ids, width_position_ids],
+            dim=-1,
+        )
+        max_grid_size = pids.max() + 1
+        rope_emb_max_grid = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rope_emb_max_grid[pids].flatten(1)
+
+        if cu_seqlens is None:
+            raise ValueError("cu_seqlens cannot be None for SiglipEncoder.")
+        if not isinstance(cu_seqlens, torch.Tensor):
+            cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=device)
+        else:
+            cu_seqlens = cu_seqlens.to(device=device)
+
+        max_seqlen = None
+        seqlens = None
+        if self.attn_backend in {_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA}:
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+        elif self.attn_backend == _Backend.XFORMERS:
+            seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb=rotary_pos_emb,
+                max_seqlen=max_seqlen,
+                seqlens=seqlens,
+            )
+        return hidden_states
+
+
+class SiglipVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_backend_override: _Backend | None = None,
+    ):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = SiglipVisionEmbeddings(config)
+        self.encoder = SiglipEncoder(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder",
+            attn_backend_override=attn_backend_override,
+        )
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        interpolate_pos_encoding: bool | None = False,
+        position_ids: torch.Tensor | None = None,
+        height_position_ids: torch.Tensor | None = None,
+        width_position_ids: torch.Tensor | None = None,
+        cu_seqlens: torch.Tensor | None = None,
+        image_grid_thw: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(
+            pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            position_ids=position_ids,
+            image_grid_thw=image_grid_thw,
+        )
+
+        last_hidden_state = self.encoder(
+            inputs_embeds=hidden_states,
+            cu_seqlens=cu_seqlens,
+            image_grid_thw=image_grid_thw,
+            height_position_ids=height_position_ids,
+            width_position_ids=width_position_ids,
+        )
+
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+        return last_hidden_state
+
+
+class SiglipVisionModel(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attn_backend_override: _Backend | None = None,
+    ):
+        super().__init__()
+
+        self.vision_model = SiglipVisionTransformer(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.vision_model",
+            attn_backend_override=attn_backend_override,
+        )
+        self.quant_config = quant_config
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.vision_model.embeddings.patch_embedding.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.vision_model.embeddings.patch_embedding.weight.device
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+
+    def forward(
+        self,
+        pixel_values,
+        interpolate_pos_encoding: bool = False,
+        position_ids: torch.Tensor | None = None,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        cu_seqlens: torch.Tensor | None = None,
+    ) -> BaseModelOutputWithPooling:
+        return self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            position_ids=position_ids,
+            image_grid_thw=image_grid_thw,
+            cu_seqlens=cu_seqlens,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "head.attention" in name or "head.layernorm" in name:
+                continue
+            if "head.mlp" in name or "head.probe" in name:
+                continue
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                param = params_dict[scale_name]
+                weight_loader = getattr(
+                    param,
+                    "weight_loader",
+                    default_weight_loader,
+                )
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+            for (
+                param_name,
+                weight_name,
+                shard_id,
+            ) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(
+                    param,
+                    "weight_loader",
+                    default_weight_loader,
+                )
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    PaddleOCRVLMultiModalProcessor,
+    info=PaddleOCRVLProcessingInfo,
+    dummy_inputs=PaddleOCRVLDummyInputsBuilder,
+)
+class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsMRoPE):
+    merge_by_field_config = True
+
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "model.": "language_model.model.",
+            "lm_head.": "language_model.lm_head.",
+        }
+    )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        attn_backend_override = (
+            multimodal_config.mm_encoder_attn_backend
+            if multimodal_config is not None
+            else None
+        )
+
+        self.visual = SiglipVisionModel(
+            config=config.vision_config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            attn_backend_override=attn_backend_override,
+        )
+        self.mlp_AR = Projector(config, config.vision_config)
+
+        self.language_model = Ernie4_5ForCausalLM(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+
+        for layer in self.language_model.model.layers:
+            if not isinstance(layer, PPMissingLayer):
+                layer.self_attn.rotary_emb.is_neox_style = True
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors
+        )
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.language_model.compute_logits(hidden_states)
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: list[list[int]] | torch.Tensor,
+        video_grid_thw: list[list[int]] | torch.Tensor,
+        second_per_grid_ts: list[float],
+        audio_feature_lengths: torch.Tensor | None = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Get mrope input positions and delta value."""
+
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        vision_start_token_id = hf_config.vision_start_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(hf_config.vision_config, "tokens_per_second", 1.0)
+
+        input_tokens_tensor = torch.tensor(input_tokens)
+        vision_start_indices = torch.argwhere(
+            input_tokens_tensor == vision_start_token_id
+        ).squeeze(1)
+        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + video_nums):
+            video_second_per_grid_t = 0.0
+            if remain_images > 0:
+                try:
+                    ed_image = input_tokens.index(image_token_id, st)
+                except ValueError:
+                    ed_image = len(input_tokens) + 1
+            else:
+                ed_image = len(input_tokens) + 1
+            if remain_videos > 0:
+                try:
+                    ed_video = input_tokens.index(video_token_id, st)
+                except ValueError:
+                    ed_video = len(input_tokens) + 1
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                video_second_per_grid_t = 1.0
+                if second_per_grid_ts:
+                    video_second_per_grid_t = second_per_grid_ts[video_index]
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t,
+                h // spatial_merge_size,
+                w // spatial_merge_size,
+            )
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+            t_index = (
+                (
+                    torch.arange(llm_grid_t)
+                    .view(-1, 1)
+                    .expand(-1, llm_grid_h * llm_grid_w)
+                    * video_second_per_grid_t
+                    * tokens_per_second
+                )
+                .long()
+                .flatten()
+            )
+
+            h_index = (
+                torch.arange(llm_grid_h)
+                .view(1, -1, 1)
+                .expand(llm_grid_t, -1, llm_grid_w)
+                .flatten()
+            )
+            w_index = (
+                torch.arange(llm_grid_w)
+                .view(1, 1, -1)
+                .expand(llm_grid_t, llm_grid_h, -1)
+                .flatten()
+            )
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+            )
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+
+        return llm_positions, mrope_position_delta
+
+    def get_language_model(self) -> nn.Module:
+        return self.language_model
+
+    def _parse_and_validate_image_input(
+        self, **kwargs: object
+    ) -> PaddleOCRImagePixelInputs | None:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_grid_thw = kwargs.pop("image_grid_thw", None)
+
+        if pixel_values is None:
+            return None
+
+        return PaddleOCRImagePixelInputs(
+            type="pixel_values",
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_thw,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            is_multimodal = kwargs.pop("is_multimodal", None)
+            handle_oov_mm_token = kwargs.pop("handle_oov_mm_token", False)
+            inputs_embeds = self.get_input_embeddings(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=is_multimodal,
+                handle_oov_mm_token=handle_oov_mm_token,
+            )
+            input_ids = None
+
+        return self.language_model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
+
+        raise ValueError("Only image modality is supported")
+
+    def encode_image(
+        self, pixel_values: torch.Tensor, image_grid_thw: torch.Tensor
+    ) -> torch.Tensor:
+        pixel_values = pixel_values.type(self.visual.dtype)
+        siglip_position_ids = list()
+        image_grid_hws = list()
+        cu_seqlens = [0]
+
+        thw_tuple = tuple(image_grid_thw.tolist())
+        numel = np.prod(thw_tuple)
+        image_grid_hws.append(thw_tuple)
+        image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
+        siglip_position_ids.append(image_position_ids)
+        cu_seqlens.append(cu_seqlens[-1] + numel)
+
+        siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(
+            pixel_values.device
+        )
+        cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(pixel_values.device)
+
+        vision_outputs = self.visual(
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_hws,
+            position_ids=siglip_position_ids,
+            interpolate_pos_encoding=True,
+            cu_seqlens=cu_seqlens,
+        )
+        return vision_outputs
+
+    def _process_image_input(
+        self, image_input: PaddleOCRImagePixelInputs
+    ) -> MultiModalEmbeddings:
+        pixel_values = image_input.pixel_values
+        image_grid_thw = image_input.image_grid_thw
+        vision_outputs = tuple(
+            self.encode_image(pixel, grid).squeeze(0)
+            for pixel, grid in zip(pixel_values, image_grid_thw)
+        )
+        image_embeds = self.mlp_AR(vision_outputs, image_grid_thw)
+        return image_embeds
+
+    def get_multimodal_embeddings(self, **kwargs) -> MultiModalEmbeddings:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return ()
+
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+        image_embeds = self._process_image_input(image_input)
+        multimodal_embeddings += tuple(image_embeds)
+
+        return multimodal_embeddings
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        autoloaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+        return autoloaded_weights
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index 2c62f6862cf25..37a7108d5c013 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -62,14 +62,23 @@ from .utils import (
 
 class PersimmonMLP(nn.Module):
     def __init__(
-        self, config: PersimmonConfig, quant_config: QuantizationConfig | None = None
+        self,
+        config: PersimmonConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.dense_h_to_4h = ColumnParallelLinear(
-            config.hidden_size, config.intermediate_size, quant_config=quant_config
+            config.hidden_size,
+            config.intermediate_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense_h_to_4h",
         )
         self.dense_4h_to_h = RowParallelLinear(
-            config.intermediate_size, config.hidden_size, quant_config=quant_config
+            config.intermediate_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense_4h_to_h",
         )
         self.act = get_act_fn(config.hidden_act)
 
@@ -110,12 +119,14 @@ class PersimmonAttention(nn.Module):
             self.total_num_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
         )
         self.dense = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             self.hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense",
         )
         self.is_qk_layernorm = config.qk_layernorm
 
@@ -192,7 +203,11 @@ class PersimmonDecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
         )
-        self.mlp = PersimmonMLP(config, quant_config=quant_config)
+        self.mlp = PersimmonMLP(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
         self.input_layernorm = nn.LayerNorm(
             config.hidden_size, eps=config.layer_norm_eps
         )
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 6adcaf5084cbe..34db124b6447c 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -99,11 +99,13 @@ class PhiAttention(nn.Module):
             self.total_num_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.dense = RowParallelLinear(
             self.hidden_size,
             self.hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.dense",
         )
 
         scaling = self.head_size**-0.5
@@ -148,7 +150,10 @@ class PhiAttention(nn.Module):
 
 class PhiMLP(nn.Module):
     def __init__(
-        self, config: PhiConfig, quant_config: QuantizationConfig | None = None
+        self,
+        config: PhiConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
     ):
         super().__init__()
 
@@ -159,11 +164,13 @@ class PhiMLP(nn.Module):
             config.hidden_size,
             n_inner,
             quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
         )
         self.fc2 = RowParallelLinear(
             n_inner,
             config.hidden_size,
             quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
         )
         self.act = get_act_fn(config.hidden_act)
 
@@ -189,7 +196,7 @@ class PhiLayer(nn.Module):
         self.self_attn = PhiAttention(
             config, cache_config, quant_config, prefix=f"{prefix}.self_attn"
         )
-        self.mlp = PhiMLP(config, quant_config)
+        self.mlp = PhiMLP(config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 2cd4d8c727216..c7436cedeb229 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -343,12 +343,14 @@ class PhiMoEAttention(nn.Module):
             self.total_num_kv_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         self.rotary_emb = get_rope(
             self.head_dim,
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 09293f63f70e1..6427ccfccc134 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -567,12 +567,14 @@ class Plamo2AttentionMixer(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             config.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 72e66d8f30384..c99f628004fbd 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -102,12 +102,14 @@ class QWenAttention(nn.Module):
             self.total_num_heads,
             bias=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_attn",
         )
         self.c_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
         self.scaling = self.head_dim**-0.5
 
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 677d34dea39b3..fac281d2caf49 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -130,6 +130,8 @@ class Qwen2_5OmniAudioFeatureInputs(TensorSchema):
         TensorShape("nmb", "tsl", dynamic_dims={"tsl"}),
     ]
 
+    audio_feature_lengths: Annotated[torch.Tensor, TensorShape("na")]
+
     feature_attention_mask: Annotated[
         torch.Tensor | list[torch.Tensor],
         TensorShape("na", "msl", dynamic_dims={"msl"}),
@@ -732,13 +734,6 @@ class Qwen2_5OmniConditionalGenerationMixin:
         input_features = audio_input["input_features"]
         audio_feature_lengths = audio_input["audio_feature_lengths"]
 
-        if audio_feature_lengths.shape[0] == 1:
-            audio_feature_lengths = audio_feature_lengths.squeeze(0)
-        elif audio_feature_lengths.shape[1] == 1:
-            audio_feature_lengths = audio_feature_lengths.squeeze(1)
-        else:
-            raise AssertionError(audio_feature_lengths.shape)
-
         audio_feat_lengths, audio_output_lengths = (
             self.audio_tower._get_feat_extract_output_lengths(audio_feature_lengths)
         )
@@ -932,8 +927,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -1130,7 +1123,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         mrope_position_delta = (
             torch.cat(llm_pos_ids_list, dim=1).max() + 1 - len(src_item)
         )
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 3585783e4ccc3..48834ba699e4c 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -46,6 +46,7 @@ from vllm.attention.backends.registry import _Backend
 from vllm.attention.layer import maybe_get_vit_flash_attn_backend
 from vllm.attention.ops.vit_attn_wrappers import (
     vit_flash_attn_wrapper,
+    vit_torch_sdpa_wrapper,
     vit_xformers_attn_wrapper,
 )
 from vllm.compilation.decorators import support_torch_compile
@@ -66,6 +67,7 @@ from vllm.model_executor.layers.linear import (
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.models.vision import should_torch_compile_mm_vit
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.evs import (
     compute_mrope_for_media,
@@ -442,23 +444,12 @@ class Qwen2_5_VisionAttention(nn.Module):
                 q = q.contiguous()
                 k = k.contiguous()
                 v = v.contiguous()
-            outputs = []
-            for i in range(1, len(cu_seqlens)):
-                start_idx = cu_seqlens[i - 1]
-                end_idx = cu_seqlens[i]
-                q_i = q[:, start_idx:end_idx]
-                k_i = k[:, start_idx:end_idx]
-                v_i = v[:, start_idx:end_idx]
-                q_i, k_i, v_i = (
-                    einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
-                )
-                output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
-                output_i = einops.rearrange(output_i, "b h s d -> b s h d ")
-                outputs.append(output_i)
-            context_layer = torch.cat(outputs, dim=1)
-            context_layer = einops.rearrange(
-                context_layer, "b s h d -> s b (h d)"
-            ).contiguous()
+            context_layer = vit_torch_sdpa_wrapper(
+                q,
+                k,
+                v,
+                cu_seqlens,
+            )
         elif self.attn_backend == _Backend.XFORMERS:
             context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens)
 
@@ -466,17 +457,16 @@ class Qwen2_5_VisionAttention(nn.Module):
         return output
 
 
-# (FIXME): Enable this after dynamic slicing is fixed
-# See https://github.com/vllm-project/vllm/pull/27760
-# @support_torch_compile(
-#     dynamic_arg_dims={
-#         "x": 0,
-#         "cu_seqlens": 0,
-#         "rotary_pos_emb": 0,
-#         "seqlens": 0,
-#     },
-#     mark_unbacked_dims={"seqlens": 0},
-# )
+@support_torch_compile(
+    dynamic_arg_dims={
+        "x": 0,
+        "cu_seqlens": 0,
+        "rotary_pos_emb": 0,
+        "seqlens": 0,
+    },
+    mark_unbacked_dims={"seqlens": 0},
+    enable_if=should_torch_compile_mm_vit,
+)
 class Qwen2_5_VisionBlock(nn.Module):
     def __init__(
         self,
@@ -541,7 +531,8 @@ class Qwen2_5_VisionBlock(nn.Module):
 @support_torch_compile(
     dynamic_arg_dims={
         "x": 0,
-    }
+    },
+    enable_if=should_torch_compile_mm_vit,
 )
 class Qwen2_5_VisionPatchEmbed(nn.Module):
     def __init__(
@@ -572,7 +563,8 @@ class Qwen2_5_VisionPatchEmbed(nn.Module):
 @support_torch_compile(
     dynamic_arg_dims={
         "x": 0,
-    }
+    },
+    enable_if=should_torch_compile_mm_vit,
 )
 class Qwen2_5_VisionPatchMerger(nn.Module):
     def __init__(
@@ -1098,6 +1090,7 @@ class Qwen2_5_VLForConditionalGeneration(
     SupportsMRoPE,
 ):
     merge_by_field_config = True
+    multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
@@ -1125,8 +1118,6 @@ class Qwen2_5_VLForConditionalGeneration(
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
         second_per_grid_ts: list[float],
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -1239,7 +1230,6 @@ class Qwen2_5_VLForConditionalGeneration(
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
 
@@ -1372,13 +1362,8 @@ class Qwen2_5_VLForConditionalGeneration(
                     image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
 
         # Split concatenated embeddings for each image item.
-        # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
         merge_size = self.visual.spatial_merge_size
-        sizes = (
-            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
-            // (merge_size * merge_size)
-        ).tolist()
-
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return image_embeds.split(sizes)
 
     def _postprocess_image_embeds_evs(
@@ -1438,12 +1423,7 @@ class Qwen2_5_VLForConditionalGeneration(
 
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size
-        # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
-        sizes = (
-            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
-            // (merge_size * merge_size)
-        ).tolist()
-
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return video_embeds.split(sizes)
 
     def _postprocess_video_embeds_evs(
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 1ec12bdb55dfe..b3999e6c934e3 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -798,21 +798,27 @@ class Qwen2VisionTransformer(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        grid_thw: list[list[int]],
+        grid_thw: torch.Tensor | list[list[int]],
     ) -> torch.Tensor:
         # patchify
         x = x.to(device=self.device, dtype=self.dtype)
         x = self.patch_embed(x)
 
+        if isinstance(grid_thw, list):
+            grid_thw_list = grid_thw
+            grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
+        else:
+            grid_thw_list = grid_thw.tolist()
+
         # compute position embedding
-        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        rotary_pos_emb = self.rot_pos_emb(grid_thw_list)
 
         # compute cu_seqlens
-        grid_thw_ = torch.tensor(grid_thw, device=x.device, dtype=torch.long)
         cu_seqlens = torch.repeat_interleave(
-            grid_thw_[:, 1] * grid_thw_[:, 2], grid_thw_[:, 0]
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
         ).cumsum(dim=0, dtype=torch.int32)
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
 
         # transformers
         x = x.unsqueeze(1)
@@ -1211,6 +1217,7 @@ class Qwen2VLForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
     merge_by_field_config = True
+    multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(
@@ -1233,8 +1240,6 @@ class Qwen2VLForConditionalGeneration(
         image_grid_thw: list[list[int]] | torch.Tensor | None,
         video_grid_thw: list[list[int]] | torch.Tensor | None,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -1353,7 +1358,6 @@ class Qwen2VLForConditionalGeneration(
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
 
@@ -1458,7 +1462,6 @@ class Qwen2VLForConditionalGeneration(
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if image_input["type"] == "image_embeds":
             image_embeds = image_input["image_embeds"]
@@ -1467,18 +1470,14 @@ class Qwen2VLForConditionalGeneration(
 
             if self.use_data_parallel:
                 return run_dp_sharded_mrope_vision_model(
-                    self.visual, pixel_values, grid_thw_list, rope_type="rope_3d"
+                    self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d"
                 )
             else:
-                image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
+                image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
 
         # Split concatenated embeddings for each image item.
         merge_size = self.visual.spatial_merge_size
-        sizes = (
-            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
-            // (merge_size * merge_size)
-        ).tolist()
-
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return image_embeds.split(sizes)
 
     def _process_video_input(
@@ -1486,26 +1485,22 @@ class Qwen2VLForConditionalGeneration(
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if video_input["type"] == "video_embeds":
             video_embeds = video_input["video_embeds"]
         else:
             pixel_values_videos = video_input["pixel_values_videos"]
             if self.use_data_parallel:
+                grid_thw_list = grid_thw.tolist()
                 return run_dp_sharded_mrope_vision_model(
                     self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d"
                 )
             else:
-                video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw_list)
+                video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
 
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size
-        sizes = (
-            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
-            // (merge_size * merge_size)
-        ).tolist()
-
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return video_embeds.split(sizes)
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 8452d7b04f5c2..d57b82cb02273 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -43,6 +43,7 @@ from vllm.distributed import (
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -132,7 +133,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts = config.num_experts
 
@@ -171,6 +172,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
+            routing_method_type=RoutingMethodType.Renormalize,
         )
 
         self.gate = ReplicatedLinear(
@@ -665,7 +667,7 @@ class Qwen3MoeForCausalLM(
         # Set MoE hyperparameters
         self.expert_weights = []
 
-        self.moe_layers: list[FusedMoE] = []
+        self.moe_layers = []
         example_layer = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
@@ -688,22 +690,6 @@ class Qwen3MoeForCausalLM(
         self.num_routed_experts = example_layer.n_routed_experts
         self.num_redundant_experts = example_layer.n_redundant_experts
 
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
     def update_physical_experts_metadata(
         self,
         num_physical_experts: int,
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index f452ba871582d..aa7de5aa5f29c 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -30,12 +30,15 @@ from vllm.distributed import (
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fla.ops import (
-    RMSNormGated,
     chunk_gated_delta_rule,
     fused_recurrent_gated_delta_rule,
 )
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
-from vllm.model_executor.layers.layernorm import GemmaRMSNorm as Qwen3NextRMSNorm
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
+from vllm.model_executor.layers.layernorm import (
+    GemmaRMSNorm as Qwen3NextRMSNorm,
+)
+from vllm.model_executor.layers.layernorm import RMSNormGated
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
@@ -107,7 +110,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
 
         self.ep_group = get_ep_group().device_group
-        self.ep_rank = self.ep_group.rank()
+        self.ep_rank = get_ep_group().rank_in_group
         self.ep_size = self.ep_group.size()
         self.n_routed_experts = config.num_experts
 
@@ -171,6 +174,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
+            routing_method_type=RoutingMethodType.Renormalize,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -436,17 +440,68 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         hidden_states: torch.Tensor,
         output: torch.Tensor,
     ):
-        return torch.ops.vllm.gdn_attention(
-            hidden_states,
-            output,
+        """
+        Forward pass with three parts:
+        1. Input projection
+        2. Core attention (custom op)
+        3. Output projection
+        """
+        num_tokens = hidden_states.size(0)
+
+        # ============================================================
+        # Part 1: Input Projection
+        # ============================================================
+        projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
+        projected_states_ba, _ = self.in_proj_ba(hidden_states)
+        query, key, value, z, b, a = self.fix_query_key_value_ordering(
+            projected_states_qkvz, projected_states_ba
+        )
+        query, key, value = map(
+            lambda x: rearrange(x, "l p d -> l (p d)"), (query, key, value)
+        )
+        mixed_qkv = torch.cat((query, key, value), dim=-1)
+
+        # ============================================================
+        # Part 2: Core Attention (Custom Op)
+        # ============================================================
+        # Note: we should not use torch.empty here like other attention backends,
+        # see discussions in https://github.com/vllm-project/vllm/pull/28182
+        core_attn_out = torch.zeros(
+            (num_tokens, self.num_v_heads // self.tp_size, self.head_v_dim),
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+
+        torch.ops.vllm.gdn_attention_core(
+            mixed_qkv,
+            b,
+            a,
+            core_attn_out,
             self.prefix,
         )
 
-    def _forward(
+        # ============================================================
+        # Part 3: Output Projection
+        # ============================================================
+        z_shape_og = z.shape
+        # Reshape input data into 2D tensor
+        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
+        z = z.reshape(-1, z.shape[-1])
+        core_attn_out = self.norm(core_attn_out, z)
+        core_attn_out = core_attn_out.reshape(z_shape_og)
+        core_attn_out = rearrange(core_attn_out, "... h d -> ... (h d)")
+        output[:num_tokens], _ = self.out_proj(core_attn_out)
+
+    def _forward_core(
         self,
-        hidden_states: torch.Tensor,
-        output: torch.Tensor,
+        mixed_qkv: torch.Tensor,
+        b: torch.Tensor,
+        a: torch.Tensor,
+        core_attn_out: torch.Tensor,
     ):
+        """
+        Core attention computation (called by custom op).
+        """
         forward_context = get_forward_context()
         attn_metadata: AttentionMetadata = forward_context.attn_metadata
 
@@ -471,18 +526,11 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         num_actual_tokens = attn_metadata.num_actual_tokens
         num_accepted_tokens = attn_metadata.num_accepted_tokens
 
-        # 1. Set up dimensions for reshapes later
-        projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states[:num_actual_tokens])
-        projected_states_ba, _ = self.in_proj_ba(hidden_states[:num_actual_tokens])
-        query, key, value, z, b, a = self.fix_query_key_value_ordering(
-            projected_states_qkvz, projected_states_ba
-        )
-        query, key, value = map(
-            lambda x: rearrange(x, "l p d -> l (p d)"), (query, key, value)
-        )
-        mixed_qkv = torch.cat((query, key, value), dim=-1)
+        mixed_qkv = mixed_qkv[:num_actual_tokens]
+        b = b[:num_actual_tokens]
+        a = a[:num_actual_tokens]
 
-        # 2. Convolution sequence transformation
+        # 1. Convolution sequence transformation
         conv_weights = self.conv1d.weight.view(
             self.conv1d.weight.size(0), self.conv1d.weight.size(2)
         )
@@ -498,7 +546,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
             mixed_qkv_spec = None
             mixed_qkv_non_spec = mixed_qkv
 
-        # 2.1: process the mutli-query part
+        # 1.1: Process the multi-query part
         if spec_sequence_masks is not None:
             mixed_qkv_spec = causal_conv1d_update(
                 mixed_qkv_spec,
@@ -515,7 +563,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
                 validate_data=False,
             )
 
-        # 2.2: process the remaining part
+        # 1.2: Process the remaining part
         if attn_metadata.num_prefills > 0:
             mixed_qkv_non_spec_T = mixed_qkv_non_spec.transpose(0, 1)
             # - "cache_indices" updates the conv_state cache in positions
@@ -551,10 +599,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
             mixed_qkv_non_spec
         )
 
-        beta = b.sigmoid()
-        # g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
-        g = fused_gdn_gating(self.A_log, a, self.dt_bias)
-        g, beta = map(lambda x: rearrange(x, "l d -> 1 l d"), (g, beta))
+        g, beta = fused_gdn_gating(self.A_log, a, b, self.dt_bias)
 
         if spec_sequence_masks is not None:
             if attn_metadata.num_prefills == 0 and attn_metadata.num_decodes == 0:
@@ -573,9 +618,9 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
             g_non_spec = g
             beta_non_spec = beta
 
-        # 3. Recurrent attention
+        # 2. Recurrent attention
 
-        # 3.1: process the mutlti-query part
+        # 2.1: Process the multi-query part
         if spec_sequence_masks is not None:
             core_attn_out_spec, last_recurrent_state = fused_recurrent_gated_delta_rule(
                 q=query_spec,
@@ -593,7 +638,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         else:
             core_attn_out_spec, last_recurrent_state = None, None
 
-        # 3.2: process the remaining part
+        # 2.2: Process the remaining part
         if attn_metadata.num_prefills > 0:
             initial_state = ssm_state[non_spec_state_indices_tensor].contiguous()
             initial_state[~has_initial_state, ...] = 0
@@ -636,30 +681,20 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         else:
             core_attn_out_non_spec, last_recurrent_state = None, None
 
-        # Merge core attention output
+        # 3. Merge core attention output
         if spec_sequence_masks is not None and core_attn_out_non_spec is not None:
-            core_attn_out = torch.empty(
+            merged_out = torch.empty(
                 (1, num_actual_tokens, *core_attn_out_spec.shape[2:]),
                 dtype=core_attn_out_non_spec.dtype,
                 device=core_attn_out_non_spec.device,
             )
-            core_attn_out.index_copy_(1, spec_token_indx, core_attn_out_spec)
-            core_attn_out.index_copy_(1, non_spec_token_indx, core_attn_out_non_spec)
-
+            merged_out.index_copy_(1, spec_token_indx, core_attn_out_spec)
+            merged_out.index_copy_(1, non_spec_token_indx, core_attn_out_non_spec)
+            core_attn_out[:num_actual_tokens] = merged_out.squeeze(0)
         elif spec_sequence_masks is not None:
-            core_attn_out = core_attn_out_spec
+            core_attn_out[:num_actual_tokens] = core_attn_out_spec.squeeze(0)
         else:
-            core_attn_out = core_attn_out_non_spec
-
-        z_shape_og = z.shape
-        # reshape input data into 2D tensor
-        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
-        z = z.reshape(-1, z.shape[-1])
-        core_attn_out = self.norm(core_attn_out, z)
-        core_attn_out = core_attn_out.reshape(z_shape_og)
-        core_attn_out = rearrange(core_attn_out, "... h d -> ... (h d)")
-
-        output[:num_actual_tokens], _ = self.out_proj(core_attn_out)
+            core_attn_out[:num_actual_tokens] = core_attn_out_non_spec.squeeze(0)
 
 
 class Qwen3NextAttention(nn.Module):
@@ -1095,8 +1130,57 @@ class Qwen3NextModel(nn.Module):
         return loaded_params
 
 
+class QwenNextMixtureOfExperts(MixtureOfExperts):
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.model.layers:
+            if isinstance(layer.mlp, Qwen3NextSparseMoeBlock):
+                moe = layer.mlp
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
+    def set_moe_parameters(self):
+        self.expert_weights = []
+
+        self.moe_layers = []
+        example_moe = None
+        for layer in self.model.layers:
+            if isinstance(layer, Qwen3NextDecoderLayer) and isinstance(
+                layer.mlp, Qwen3NextSparseMoeBlock
+            ):
+                example_moe = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+            if example_moe is None:
+                raise RuntimeError("No Qwen3Next layer found in the model.layers.")
+
+        # Set MoE hyperparameters
+        self.num_moe_layers = len(self.moe_layers)
+        self.num_expert_groups = 1
+        self.num_shared_experts = 0
+        self.num_logical_experts = example_moe.n_logical_experts
+        self.num_physical_experts = example_moe.n_physical_experts
+        self.num_local_physical_experts = example_moe.n_local_physical_experts
+        self.num_routed_experts = example_moe.n_routed_experts
+        self.num_redundant_experts = example_moe.n_redundant_experts
+
+
 class Qwen3NextForCausalLM(
-    nn.Module, HasInnerState, SupportsLoRA, SupportsPP, MixtureOfExperts, IsHybrid
+    nn.Module,
+    HasInnerState,
+    SupportsLoRA,
+    SupportsPP,
+    QwenNextMixtureOfExperts,
+    IsHybrid,
 ):
     packed_modules_mapping = {
         "qkv_proj": [
@@ -1147,63 +1231,7 @@ class Qwen3NextForCausalLM(
         )
 
         # Set MoE hyperparameters
-        self.expert_weights = []
-
-        self.moe_layers: list[SharedFusedMoE] = []
-        example_layer = None
-        for layer in self.model.layers:
-            if isinstance(layer, PPMissingLayer):
-                continue
-
-            assert isinstance(layer, Qwen3NextDecoderLayer)
-            if isinstance(layer.mlp, Qwen3NextSparseMoeBlock):
-                example_layer = layer.mlp
-                self.moe_layers.append(layer.mlp.experts)
-
-        if example_layer is None:
-            raise RuntimeError("No Qwen3Next layer found in the model.layers.")
-
-        self.num_moe_layers = len(self.moe_layers)
-        self.num_expert_groups = 1
-        self.num_shared_experts = 0
-        self.num_logical_experts = example_layer.n_logical_experts
-        self.num_physical_experts = example_layer.n_physical_experts
-        self.num_local_physical_experts = example_layer.n_local_physical_experts
-        self.num_routed_experts = example_layer.n_routed_experts
-        self.num_redundant_experts = example_layer.n_redundant_experts
-
-    def set_eplb_state(
-        self,
-        expert_load_view: torch.Tensor,
-        logical_to_physical_map: torch.Tensor,
-        logical_replica_count: torch.Tensor,
-    ) -> None:
-        for layer_idx, layer in enumerate(self.moe_layers):
-            # Register the expert weights.
-            self.expert_weights.append(layer.get_expert_weights())
-            layer.set_eplb_state(
-                moe_layer_idx=layer_idx,
-                expert_load_view=expert_load_view,
-                logical_to_physical_map=logical_to_physical_map,
-                logical_replica_count=logical_replica_count,
-            )
-
-    def update_physical_experts_metadata(
-        self,
-        num_physical_experts: int,
-        num_local_physical_experts: int,
-    ) -> None:
-        assert self.num_local_physical_experts == num_local_physical_experts
-        self.num_physical_experts = num_physical_experts
-        self.num_local_physical_experts = num_local_physical_experts
-        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
-        for layer in self.model.layers:
-            if isinstance(layer.mlp, Qwen3NextSparseMoeBlock):
-                moe = layer.mlp
-                moe.n_local_physical_experts = num_local_physical_experts
-                moe.n_physical_experts = num_physical_experts
-                moe.n_redundant_experts = self.num_redundant_experts
-                moe.experts.update_expert_map()
+        self.set_moe_parameters()
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
@@ -1270,38 +1298,54 @@ class Qwen3NextForCausalLM(
         return self.model.get_expert_mapping()
 
 
-def gdn_attention(
-    hidden_states: torch.Tensor,
-    output: torch.Tensor,
+def gdn_attention_core(
+    mixed_qkv: torch.Tensor,
+    b: torch.Tensor,
+    a: torch.Tensor,
+    core_attn_out: torch.Tensor,
     layer_name: str,
 ) -> None:
+    """
+    Custom op for the core attention computation.
+    Only handles the convolution + recurrent attention part.
+    Input/output projections are handled outside this op.
+    """
     forward_context: ForwardContext = get_forward_context()
     self = forward_context.no_compile_layers[layer_name]
-    self._forward(hidden_states=hidden_states, output=output)
+    self._forward_core(
+        mixed_qkv=mixed_qkv,
+        b=b,
+        a=a,
+        core_attn_out=core_attn_out,
+    )
 
 
-def gdn_attention_fake(
-    hidden_states: torch.Tensor,
-    output: torch.Tensor,
+def gdn_attention_core_fake(
+    mixed_qkv: torch.Tensor,
+    b: torch.Tensor,
+    a: torch.Tensor,
+    core_attn_out: torch.Tensor,
     layer_name: str,
 ) -> None:
+    """Fake implementation for torch.compile."""
     return
 
 
 direct_register_custom_op(
-    op_name="gdn_attention",
-    op_func=gdn_attention,
-    mutates_args=["output"],
-    fake_impl=gdn_attention_fake,
+    op_name="gdn_attention_core",
+    op_func=gdn_attention_core,
+    mutates_args=["core_attn_out"],
+    fake_impl=gdn_attention_core_fake,
 )
 
 
-# g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
 @triton.jit
 def fused_gdn_gating_kernel(
     g,
+    beta_output,
     A_log,
     a,
+    b,
     dt_bias,
     seq_len,
     NUM_HEADS: tl.constexpr,
@@ -1315,6 +1359,7 @@ def fused_gdn_gating_kernel(
     mask = head_off < NUM_HEADS
     blk_A_log = tl.load(A_log + head_off, mask=mask)
     blk_a = tl.load(a + off, mask=mask)
+    blk_b = tl.load(b + off, mask=mask)
     blk_bias = tl.load(dt_bias + head_off, mask=mask)
     # If the model is loaded in fp16, without the .float() here, A might be -inf
     x = blk_a.to(tl.float32) + blk_bias.to(tl.float32)
@@ -1323,20 +1368,44 @@ def fused_gdn_gating_kernel(
     )
     blk_g = -tl.exp(blk_A_log.to(tl.float32)) * softplus_x
     tl.store(g + off, blk_g.to(g.dtype.element_ty), mask=mask)
+    # compute beta_output = sigmoid(b)
+    blk_beta_output = tl.sigmoid(blk_b.to(tl.float32))
+    tl.store(
+        beta_output + off, blk_beta_output.to(beta_output.dtype.element_ty), mask=mask
+    )
 
 
 def fused_gdn_gating(
     A_log: torch.Tensor,
     a: torch.Tensor,
+    b: torch.Tensor,
     dt_bias: torch.Tensor,
     beta: float = 1.0,
     threshold: float = 20.0,
-) -> torch.Tensor:
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Fused computation of g and beta for Gated Delta Net.
+    g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
+    beta_output = b.sigmoid()
+    TODO maybe use torch.compile to replace this triton kernel
+    """
     batch, num_heads = a.shape
     seq_len = 1
     grid = (batch, seq_len, triton.cdiv(num_heads, 8))
-    g = torch.empty_like(a, dtype=torch.float32)
+    g = torch.empty(1, batch, num_heads, dtype=torch.float32, device=a.device)
+    beta_output = torch.empty(1, batch, num_heads, dtype=b.dtype, device=b.device)
     fused_gdn_gating_kernel[grid](
-        g, A_log, a, dt_bias, seq_len, num_heads, beta, threshold, 8, num_warps=1
+        g,
+        beta_output,
+        A_log,
+        a,
+        b,
+        dt_bias,
+        seq_len,
+        num_heads,
+        beta,
+        threshold,
+        8,
+        num_warps=1,
     )
-    return g
+    return g, beta_output
diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py
index a447484ae82a0..271b76adcff7e 100644
--- a/vllm/model_executor/models/qwen3_next_mtp.py
+++ b/vllm/model_executor/models/qwen3_next_mtp.py
@@ -23,6 +23,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.qwen3_next import (
     Qwen3NextDecoderLayer,
     Qwen3NextRMSNorm,
+    QwenNextMixtureOfExperts,
 )
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import Qwen3NextConfig
@@ -226,7 +227,7 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
 
 
 @support_torch_compile
-class Qwen3NextMTP(nn.Module, SupportsPP):
+class Qwen3NextMTP(nn.Module, SupportsPP, QwenNextMixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -265,6 +266,7 @@ class Qwen3NextMTP(nn.Module, SupportsPP):
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors
         )
+        self.set_moe_parameters()
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index efcd003fbbda7..da489a812f55d 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -99,7 +99,6 @@ from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
     _merge_multimodal_embeddings,
-    flatten_bn,
     maybe_prefix,
 )
 from .vision import (
@@ -1065,8 +1064,6 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix
         input_features = audio_input["input_features"]
         audio_feature_lengths = audio_input["audio_feature_lengths"]
 
-        audio_feature_lengths = flatten_bn(audio_feature_lengths, concat=True)
-
         audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(
             audio_feature_lengths
         )
@@ -1420,8 +1417,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         image_grid_thw: list[list[int]] | torch.Tensor | None,
         video_grid_thw: list[list[int]] | torch.Tensor | None,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index d611580c71821..fe0124ef3258b 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -414,16 +414,10 @@ class Qwen3_VisionTransformer(nn.Module):
     def device(self) -> torch.device:
         return self.patch_embed.proj.weight.device
 
-    def rot_pos_emb(self, grid_thw):
+    def rot_pos_emb(self, grid_thw: list[list[int]]):
         pos_ids = []
-        # Support both Tensor and list inputs for DP path
-        if isinstance(grid_thw, list):
-            grid_list = grid_thw
-            max_grid_size = max(max(h, w) for _, h, w in grid_list)
-        else:
-            grid_list = grid_thw.tolist()
-            max_grid_size = int(grid_thw[:, 1:].max().item())
-        for t, h, w in grid_list:
+        max_grid_size = max(max(h, w) for _, h, w in grid_thw)
+        for t, h, w in grid_thw:
             hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
             hpos_ids = hpos_ids.reshape(
                 h // self.spatial_merge_size,
@@ -527,24 +521,25 @@ class Qwen3_VisionTransformer(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        grid_thw: list[list[int]],
+        grid_thw: torch.Tensor | list[list[int]],
     ) -> torch.Tensor:
         hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True)
         hidden_states = self.patch_embed(hidden_states)
 
-        pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
+        if isinstance(grid_thw, list):
+            grid_thw_list = grid_thw
+            grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
+        else:
+            grid_thw_list = grid_thw.tolist()
+
+        pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list)
         hidden_states = hidden_states + pos_embeds
-        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        rotary_pos_emb = self.rot_pos_emb(grid_thw_list)
         rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, non_blocking=True)
 
-        grid_thw_tensor = torch.tensor(grid_thw, dtype=torch.int32)
-
         cu_seqlens = torch.repeat_interleave(
-            grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2], grid_thw_tensor[:, 0]
-        ).cumsum(
-            dim=0,
-            dtype=grid_thw_tensor.dtype if torch.jit.is_tracing() else torch.int32,
-        )
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32)
         cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
 
         hidden_states = hidden_states.unsqueeze(1)
@@ -1177,6 +1172,7 @@ class Qwen3VLForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
     merge_by_field_config = True
+    multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     packed_modules_mapping = {
         "qkv_proj": [
@@ -1356,7 +1352,6 @@ class Qwen3VLForConditionalGeneration(
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if image_input["type"] == "image_embeds":
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
@@ -1364,18 +1359,14 @@ class Qwen3VLForConditionalGeneration(
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
             if self.use_data_parallel:
                 return run_dp_sharded_mrope_vision_model(
-                    self.visual, pixel_values, grid_thw_list, rope_type="rope_3d"
+                    self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d"
                 )
             else:
-                image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
+                image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
 
         # Split concatenated embeddings for each image item.
-        # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
         merge_size = self.visual.spatial_merge_size
-        sizes = (
-            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
-            // (merge_size * merge_size)
-        ).tolist()
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return image_embeds.split(sizes)
 
     def _process_video_input(
@@ -1383,7 +1374,6 @@ class Qwen3VLForConditionalGeneration(
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if video_input["type"] == "video_embeds":
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
@@ -1392,19 +1382,16 @@ class Qwen3VLForConditionalGeneration(
                 self.visual.dtype
             )
             if self.use_data_parallel:
+                grid_thw_list = grid_thw.tolist()
                 return run_dp_sharded_mrope_vision_model(
                     self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d"
                 )
             else:
-                video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw_list)
+                video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
 
         # Split concatenated embeddings for each video item.
-        # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
         merge_size = self.visual.spatial_merge_size
-        sizes = (
-            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
-            // (merge_size * merge_size)
-        ).tolist()
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
         return video_embeds.split(sizes)
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
@@ -1432,8 +1419,6 @@ class Qwen3VLForConditionalGeneration(
         hf_config: PretrainedConfig,
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
-        context_len: int = 0,
-        seq_len: int | None = None,
         second_per_grid_ts: list[float] | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
@@ -1532,7 +1517,7 @@ class Qwen3VLForConditionalGeneration(
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
+
         return llm_positions, mrope_position_delta
 
     def get_language_model(self) -> torch.nn.Module:
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 7eca1a09e5365..4af8fa01f562b 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -76,7 +76,7 @@ _TEXT_GENERATION_MODELS = {
     "CwmForCausalLM": ("llama", "LlamaForCausalLM"),
     "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
     "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"),
-    "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
+    "DeepseekForCausalLM": ("deepseek_v2", "DeepseekForCausalLM"),
     "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
     "DeepseekV3ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),
     "DeepseekV32ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"),
@@ -149,6 +149,8 @@ _TEXT_GENERATION_MODELS = {
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
     "OuroForCausalLM": ("ouro", "OuroForCausalLM"),
+    "PanguEmbeddedForCausalLM": ("openpangu", "PanguEmbeddedForCausalLM"),
+    "PanguUltraMoEForCausalLM": ("openpangu", "PanguUltraMoEForCausalLM"),
     "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
     "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
@@ -340,6 +342,10 @@ _MULTIMODAL_MODELS = {
     "NVLM_D": ("nvlm_d", "NVLM_D_Model"),
     "Ovis": ("ovis", "Ovis"),
     "Ovis2_5": ("ovis2_5", "Ovis2_5"),
+    "PaddleOCRVLForConditionalGeneration": (
+        "paddleocr_vl",
+        "PaddleOCRVLForConditionalGeneration",
+    ),
     "PaliGemmaForConditionalGeneration": (
         "paligemma",
         "PaliGemmaForConditionalGeneration",
@@ -402,6 +408,7 @@ _SPECULATIVE_DECODING_MODELS = {
     "LongCatFlashMTPModel": ("longcat_flash_mtp", "LongCatFlashMTP"),
     "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"),
     "MedusaModel": ("medusa", "Medusa"),
+    "OpenPanguMTPModel": ("openpangu_mtp", "OpenPanguMTP"),
     "Qwen3NextMTP": ("qwen3_next_mtp", "Qwen3NextMTP"),
     # Temporarily disabled.
     # # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index e363be523dcce..3cbdd64acc4a9 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -19,6 +19,7 @@ from transformers import (
 )
 
 from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
@@ -379,6 +380,7 @@ class SiglipAttention(nn.Module):
         quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
+        attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
 
@@ -413,8 +415,11 @@ class SiglipAttention(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        self.attn = MultiHeadAttention(
-            self.num_heads_per_partition, self.head_dim, self.scale
+        self.attn = attn_cls(
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            prefix=f"{prefix}.attn",
         )
 
     def forward(
@@ -424,25 +429,7 @@ class SiglipAttention(nn.Module):
         """Input shape: Batch x Time x Channel"""
         qkv_states, _ = self.qkv_proj(hidden_states)
         query_states, key_states, value_states = qkv_states.chunk(3, dim=-1)
-
-        needs_unsqueeze = query_states.ndim == 2
-        if needs_unsqueeze:
-            query_states, key_states, value_states = (
-                query_states.unsqueeze(0),
-                key_states.unsqueeze(0),
-                value_states.unsqueeze(0),
-            )
-
         out = self.attn(query_states, key_states, value_states)
-
-        if needs_unsqueeze:
-            out, query_states, key_states, value_states = (
-                out.squeeze(0),
-                query_states.squeeze(0),
-                key_states.squeeze(0),
-                value_states.squeeze(0),
-            )
-
         attn_output, _ = self.out_proj(out)
 
         return attn_output, None
@@ -495,6 +482,7 @@ class SiglipEncoderLayer(nn.Module):
         quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
+        attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
 
@@ -504,6 +492,7 @@ class SiglipEncoderLayer(nn.Module):
             config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
+            attn_cls=attn_cls,
         )
         self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = SiglipMLP(
@@ -539,6 +528,7 @@ class SiglipEncoder(nn.Module):
         num_hidden_layers_override: int | None = None,
         *,
         prefix: str = "",
+        attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
 
@@ -555,6 +545,7 @@ class SiglipEncoder(nn.Module):
                     config,
                     quant_config=quant_config,
                     prefix=f"{prefix}.layers.{layer_idx}",
+                    attn_cls=attn_cls,
                 )
                 for layer_idx in range(num_hidden_layers)
             ]
@@ -598,6 +589,7 @@ class SiglipTextTransformer(nn.Module):
             config=config,
             quant_config=quant_config,
             prefix=f"{prefix}.encoder",
+            attn_cls=EncoderOnlyAttention,
         )
 
         self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@@ -709,6 +701,7 @@ class SiglipVisionTransformer(nn.Module):
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override,
             prefix=f"{prefix}.encoder",
+            attn_cls=MultiHeadAttention,
         )
 
         num_hidden_layers = config.num_hidden_layers
@@ -1034,10 +1027,56 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
             inputs_embeds=inputs_embeds,
         )
         text_features = self.text_model.head(last_hidden_state)
-        # Flip to extract CLS token (first token after reversal) for pooling
-        text_features = text_features.flip(0)
+
+        # SigLIP uses reversed position_ids;
+        # flip sequences to move EOS token to first position
+        text_features = self._flip_sequences_by_position_ids(
+            text_features, position_ids
+        )
+
         return text_features
 
+    def _flip_sequences_by_position_ids(
+        self,
+        features: torch.Tensor,
+        position_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        """Flip sequences so EOS token moves to first position for CLS pooling.
+
+        SigLIP position_ids are reversed within each sequence. This method detects
+        sequence boundaries and flips each sequence individually.
+        """
+        if len(features) == 1:
+            return features
+
+        # Detect sequence boundaries where position_ids decrease
+        position_diffs = position_ids[1:] - position_ids[:-1]
+        boundary_mask = position_diffs <= 0
+
+        boundary_indices = torch.cat(
+            [
+                torch.tensor([0], device=features.device),
+                torch.where(boundary_mask)[0] + 1,
+                torch.tensor([len(features)], device=features.device),
+            ]
+        )
+
+        # For each sequence [start, end), position i flips to: start + end - 1 - i
+        lengths = boundary_indices[1:] - boundary_indices[:-1]
+        starts = boundary_indices[:-1]
+        ends = boundary_indices[1:]
+
+        # Assign sequence ID to each element
+        sequence_ids = torch.arange(
+            len(lengths), device=features.device
+        ).repeat_interleave(lengths)
+
+        # Calculate flipped indices for all positions at once
+        current_positions = torch.arange(len(features), device=features.device)
+        flip_indices = starts[sequence_ids] + ends[sequence_ids] - 1 - current_positions
+
+        return features[flip_indices]
+
     def get_image_features(
         self,
         pixel_values: torch.Tensor,
diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
index 41d170c9e1397..eb992f7bec72b 100644
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -28,6 +28,7 @@ from transformers import AutoModel
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
 
 from vllm.attention import Attention, AttentionType
+from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.config.utils import getattr_iter
 from vllm.distributed import get_pp_group, get_tp_group
 from vllm.distributed.utils import get_pp_indices
@@ -317,7 +318,7 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP):
         # vLLM does not support encoder-decoder models, so if any encoder layer is
         # found in a text only model, we assume the whole model is an encoder model
         if has_encoder(self.model) and not is_multimodal(self.config):
-            self.check_version("4.57.0.dev0", "encoder models support")
+            self.check_version("5.0.0.dev0", "encoder models support")
             attn_type = AttentionType.ENCODER_ONLY
         else:
             attn_type = AttentionType.DECODER
@@ -336,7 +337,12 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP):
             ):
                 per_layer_sliding_window = self.config.sliding_window
 
-            attention_instances[i] = Attention(
+            attn_cls = (
+                EncoderOnlyAttention
+                if attn_type == AttentionType.ENCODER_ONLY
+                else Attention
+            )
+            attention_instances[i] = attn_cls(
                 num_heads=num_heads,
                 head_size=head_size,
                 # NOTE: We use Llama scale as default, if it's set by
diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py
index 5de786f99580f..8e39eb0b9902c 100644
--- a/vllm/model_executor/models/transformers/moe.py
+++ b/vllm/model_executor/models/transformers/moe.py
@@ -115,7 +115,7 @@ direct_register_custom_op(
 
 class MoEMixin(MixtureOfExperts):
     def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
-        self.check_version("4.57.0.dev0", "MoE models support")
+        self.check_version("5.0.0.dev0", "MoE models support")
         # Skip MixtureOfExperts.__init__ and call the next class in MRO
         super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix)
 
@@ -125,7 +125,7 @@ class MoEMixin(MixtureOfExperts):
         logical_to_physical_map: torch.Tensor,
         logical_replica_count: torch.Tensor,
     ):
-        for moe_layer_idx, mlp_layer in enumerate(self.mlp_layers):
+        for moe_layer_idx, mlp_layer in enumerate(self.mlp_moe_layers):
             mlp_layer.experts.set_eplb_state(
                 moe_layer_idx=moe_layer_idx,
                 expert_load_view=expert_load_view,
@@ -142,7 +142,7 @@ class MoEMixin(MixtureOfExperts):
         self.num_physical_experts = num_physical_experts
         self.num_local_physical_experts = num_local_physical_experts
         self.num_redundant_experts = num_physical_experts - self.num_logical_experts
-        for mlp in self.mlp_layers:
+        for mlp in self.mlp_moe_layers:
             mlp.n_local_physical_experts = num_local_physical_experts
             mlp.n_physical_experts = num_physical_experts
             mlp.n_redundant_experts = self.num_redundant_experts
@@ -240,7 +240,8 @@ class MoEMixin(MixtureOfExperts):
         # MixtureOfExperts mixin settings
         ep_size = get_ep_group().world_size
 
-        self.mlp_layers = []  # Used for MixtureOfExperts methods
+        self.mlp_moe_layers = []  # Used for MixtureOfExperts methods
+        self.moe_layers = []
         self.expert_weights = []
         self.num_moe_layers = 0
         self.num_expert_groups = 1 if num_expert_group is None else num_expert_group
@@ -298,7 +299,8 @@ class MoEMixin(MixtureOfExperts):
                     mlp.experts = fused_experts
                     log_replacement(qual_name, experts, fused_experts)
                     # Update MixtureOfExperts mixin state
-                    self.mlp_layers.append(mlp)
+                    self.mlp_moe_layers.append(mlp)
+                    self.moe_layers.append(fused_experts)
                     self.expert_weights.append(fused_experts.get_expert_weights())
                     self.num_moe_layers += 1
                     # If results are not all-reduced in FusedMoE, ensure they
diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py
index 10abd86595360..476074542e6ae 100644
--- a/vllm/model_executor/models/transformers/multimodal.py
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -371,8 +371,6 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
         image_grid_thw: list[list[int]] | torch.Tensor | None,
         video_grid_thw: list[list[int]] | torch.Tensor | None,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -390,7 +388,7 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
             video_grid_thw=video_grid_thw,
         )
 
-        mrope_positions = mrope_positions[:, 0, context_len:seq_len]
+        mrope_positions = mrope_positions[:, 0]
         mrope_position_delta = mrope_position_delta[0].item()
 
         return mrope_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 0690788502171..e5ebd8138b0ac 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -12,7 +12,6 @@ from torch.func import functional_call
 from transformers import PretrainedConfig
 from typing_extensions import deprecated
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
@@ -576,11 +575,8 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
     pin_memory = is_pin_memory_available()
     uva_available = is_uva_available()
 
-    if envs.VLLM_USE_V1:
-        assert uva_available, "V1 CPU offloading requires uva (pin memory) support"
-        uva_offloading = True
-    else:
-        uva_offloading = False
+    assert uva_available, "V1 CPU offloading requires uva (pin memory) support"
+    uva_offloading = True
 
     # offload parameters to CPU
     # use pin_memory if possible, which helps cudagraph capture speed
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index b5f6c60514c09..9f94387c700d6 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -11,6 +11,7 @@ import torch
 from transformers import PretrainedConfig
 
 from vllm.attention.backends.registry import _Backend
+from vllm.config import VllmConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -100,6 +101,11 @@ def get_vit_attn_backend(
     return current_platform.get_vit_attn_backend(head_size, dtype)
 
 
+def should_torch_compile_mm_vit(vllm_config: VllmConfig) -> bool:
+    """Callable to be passed to `@support_torch_compile`'s `enable_if` argument."""
+    return vllm_config.compilation_config.compile_mm_encoder
+
+
 VisionFeatureSelectStrategyStr = Literal["class", "default", "full"]
 
 VisionFeatureSelectStrategy: TypeAlias = (
diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py
index a6cfcf509776f..bc1351600a2f4 100644
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -75,7 +75,12 @@ class Zamba2LoRA(nn.Module):
         super().__init__()
 
         self.A = ColumnParallelLinear(
-            input_dim, rank, bias=False, quant_config=quant_config, gather_output=True
+            input_dim,
+            rank,
+            bias=False,
+            quant_config=quant_config,
+            gather_output=True,
+            prefix=f"{prefix}.A",
         )
 
         if isinstance(output_dim, list):
@@ -150,12 +155,14 @@ class Zamba2Attention(nn.Module):
             self.total_num_attention_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.attention_hidden_size,
             config.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         # Even though in Zamba2 weights are shared between attention layers, KV
@@ -197,18 +204,21 @@ class Zamba2Attention(nn.Module):
                         config.adapter_rank,
                         self.attention_hidden_size,
                         quant_config=quant_config,
+                        prefix=f"{prefix}.linear_q_adapter",
                     )
                     linear_k_adapter = Zamba2LoRA(
                         self.attention_hidden_size,
                         config.adapter_rank,
                         self.attention_hidden_size,
                         quant_config=quant_config,
+                        prefix=f"{prefix}.linear_k_adapter",
                     )
                     linear_v_adapter = Zamba2LoRA(
                         self.attention_hidden_size,
                         config.adapter_rank,
                         self.attention_hidden_size,
                         quant_config=quant_config,
+                        prefix=f"{prefix}.linear_v_adapter",
                     )
                 else:
                     linear_q_adapter = nn.Identity()
@@ -312,6 +322,7 @@ class Zamba2MLP(nn.Module):
             2 * [self.intermediate_size],  # 2x for gate and input projections
             bias=self.config.add_bias_linear,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
 
         self.down_proj = RowParallelLinear(
@@ -319,6 +330,7 @@ class Zamba2MLP(nn.Module):
             self.hidden_size,
             bias=self.config.add_bias_linear,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
 
         # Only allow GELU activations
@@ -418,6 +430,7 @@ class Zamba2AttentionDecoderLayer(nn.Module):
             bare_block_idx=bare_block_idx,
             num_hybrid_layers=num_hybrid_layers,
             quant_config=quant_config,
+            prefix=f"{prefix}.feed_forward",
         )
 
         # Initialize layer normalizations
@@ -599,6 +612,7 @@ class Zamba2HybridLayer(nn.Module):
             config.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.linear",
         )
         self.mamba_decoder = Zamba2MambaDecoderLayer(
             config,
diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py
index 78cbcd8e5427f..e0c584df8760b 100644
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -13,7 +13,7 @@ import vllm.envs as envs
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
 from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod
 from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
     TritonOrDeepGemmExperts,
@@ -148,6 +148,9 @@ def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool:
 
 
 def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
+    if not (envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM):
+        return False
+
     if not isinstance(module, FusedMoE):
         return False
 
@@ -160,8 +163,8 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
     ):
         return False
 
-    if not isinstance(module.quant_method.fused_experts, FusedMoEModularKernel):
-        # fused_experts could invoke deep_gemm_moe_fp8
+    if not isinstance(module.quant_method, FusedMoEModularMethod):
+        # modular kernels could invoke deep_gemm_moe_fp8
         return True
 
     mk: FusedMoEModularKernel = module.quant_method.fused_experts
diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
index ffa3bc8f021ef..28792338f036f 100644
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -11,7 +11,6 @@ from typing import TYPE_CHECKING
 import torch
 
 import vllm.envs as envs
-from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup
 from vllm.platforms import current_platform
@@ -25,26 +24,6 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
 
 
-def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool:
-    """
-    Record known issues with vllm + flashinfer autotune here. Return True if
-    and only if flashinfer autotune will run through without issues.
-    """
-    is_tp_or_dp = (vllm_config.parallel_config.data_parallel_size > 1) or (
-        vllm_config.parallel_config.tensor_parallel_size > 1
-    )
-    is_fi_mxfp4_backend = (
-        envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
-        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
-    ) or (
-        current_platform.is_cuda() and current_platform.is_device_capability(100)
-    )  # on >=sm100, default mxfp4 backend is flashinfer
-    is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
-
-    return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager)
-
-
 def kernel_warmup(worker: "Worker"):
     # Deep GEMM warmup
     do_deep_gemm_warmup = (
@@ -58,11 +37,7 @@ def kernel_warmup(worker: "Worker"):
         deep_gemm_warmup(model, max_tokens)
 
     # FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs
-    if (
-        has_flashinfer()
-        and current_platform.has_device_capability(90)
-        and flashinfer_autotune_supported(worker.vllm_config)
-    ):
+    if has_flashinfer() and current_platform.has_device_capability(90):
         flashinfer_autotune(worker.model_runner)
 
     # FlashInfer attention warmup
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index b864c52dfbc8b..cb70041e9744f 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -9,7 +9,6 @@ import numpy as np
 import numpy.typing as npt
 from PIL import Image
 
-import vllm.envs as envs
 from vllm.config.multimodal import (
     AudioDummyOptions,
     BaseDummyOptions,
@@ -306,18 +305,6 @@ class MultiModalProfiler(Generic[_I]):
         if processor.pad_dummy_encoder_prompt:
             num_tokens_to_pad = max(total_len, seq_len) - total_len
             encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
-        # NOTE: Whisper allows total_len > seq_len.
-        elif total_len > seq_len and not envs.VLLM_USE_V1:
-            # `max_num_batched_tokens` is defined by `SchedulerConfig`
-            logger.warning_once(
-                "The encoder sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "  # noqa: E501
-                "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "  # noqa: E501
-                "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "  # noqa: E501
-                "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",  # noqa: E501
-                seq_len,
-                total_len,
-                str(self._get_mm_num_tokens(mm_inputs)),
-            )
 
         return DummyEncoderData(encoder_prompt_token_ids)
 
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 7f259dad08f90..3f55c46ca334d 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -3,7 +3,7 @@
 
 import asyncio
 import atexit
-from collections.abc import Iterable
+from collections.abc import Iterable, Set
 from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
 from pathlib import Path
@@ -20,6 +20,7 @@ import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
 from vllm.logger import init_logger
 from vllm.utils.jsontree import json_map_leaves
+from vllm.utils.registry import ExtensionManager
 
 from .audio import AudioMediaIO
 from .base import MediaIO
@@ -46,7 +47,10 @@ atexit.register(global_thread_pool.shutdown)
 
 _M = TypeVar("_M")
 
+MEDIA_CONNECTOR_REGISTRY = ExtensionManager()
 
+
+@MEDIA_CONNECTOR_REGISTRY.register("http")
 class MediaConnector:
     def __init__(
         self,
@@ -398,6 +402,7 @@ def group_mm_kwargs_by_modality(
     device: torch.types.Device = None,
     pin_memory: bool = False,
     merge_by_field_config: bool | None = None,
+    multimodal_cpu_fields: Set[str] = frozenset(),
 ) -> Iterable[tuple[str, int, BatchedTensorInputs]]:
     """Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same
     modality together into the same `MultiModalKwargs` instance.
@@ -439,10 +444,17 @@ def group_mm_kwargs_by_modality(
             )
 
             if device is not None:
-                mm_kwargs_group = json_map_leaves(
-                    lambda x: x.to(device=device) if isinstance(x, torch.Tensor) else x,
-                    mm_kwargs_group,
-                )
+                mm_kwargs_group = {
+                    k: json_map_leaves(
+                        lambda x: x.to(device=device, non_blocking=True)
+                        if isinstance(x, torch.Tensor)
+                        else x,
+                        v,
+                    )
+                    if k not in multimodal_cpu_fields
+                    else v
+                    for k, v in mm_kwargs_group.items()
+                }
         else:
             mm_kwargs_group = MultiModalKwargs.as_kwargs(
                 MultiModalKwargs.batch(
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 666ef275a9247..369c5e6cb4d10 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -14,6 +14,7 @@ from PIL import Image
 
 from vllm import envs
 from vllm.logger import init_logger
+from vllm.utils.registry import ExtensionManager
 
 from .base import MediaIO
 from .image import ImageMediaIO
@@ -63,25 +64,7 @@ class VideoLoader:
         raise NotImplementedError
 
 
-class VideoLoaderRegistry:
-    def __init__(self) -> None:
-        self.name2class: dict[str, type] = {}
-
-    def register(self, name: str):
-        def wrap(cls_to_register):
-            self.name2class[name] = cls_to_register
-            return cls_to_register
-
-        return wrap
-
-    @staticmethod
-    def load(cls_name: str) -> VideoLoader:
-        cls = VIDEO_LOADER_REGISTRY.name2class.get(cls_name)
-        assert cls is not None, f"VideoLoader class {cls_name} not found"
-        return cls()
-
-
-VIDEO_LOADER_REGISTRY = VideoLoaderRegistry()
+VIDEO_LOADER_REGISTRY = ExtensionManager()
 
 
 @VIDEO_LOADER_REGISTRY.register("opencv")
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 4b9f4aef022d0..ee904535ffe8d 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -14,6 +14,7 @@ from typing import TYPE_CHECKING
 import regex as re
 import torch
 
+from vllm import envs
 from vllm.logger import init_logger
 from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
 
@@ -151,7 +152,6 @@ class CpuPlatform(Platform):
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
-        import vllm.envs as envs
         from vllm.utils.mem_constants import GiB_bytes
 
         kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
@@ -289,17 +289,22 @@ class CpuPlatform(Platform):
         os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
         # Note: to avoid the error 'nthreads cannot be larger than environment
-        #  variable "NUMEXPR_MAX_THREADS" (64)'.
+        # variable "NUMEXPR_MAX_THREADS" (64)'.
         os.environ["NUMEXPR_MAX_THREADS"] = str(get_max_threads())
 
-        # Set default threads num for OpenMP parallel
-        os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
+        if envs.VLLM_CPU_OMP_THREADS_BIND != "nobind":
+            # Set default threads num for OpenMP parallel
+            os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
+        else:
+            # In this case, setting the OpenMP configuration via
+            # OMP_NUM_THREADS is up to the user.
+            logger.info("Disabling binding processes to CPU cores...")
 
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
         # Disable multi-stream for shared experts as no Stream on CPU
-        os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "0"
+        os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"
 
         # Intel OpenMP setting
         ld_preload_str = os.getenv("LD_PRELOAD", "")
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index cc06f034fba32..32734c3aba5ef 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -276,17 +276,12 @@ class CudaPlatformBase(Platform):
                     "FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set "
                     "VLLM_MLA_DISABLE=1 to disable MLA for this model."
                 )
-            if not use_v1:
-                raise RuntimeError(
-                    "MLA attention backends require the V1 engine. "
-                    "Set VLLM_USE_V1=1 to enable them."
-                )
 
             from vllm.attention.ops.flashmla import is_flashmla_dense_supported
             from vllm.attention.utils.fa_utils import flash_attn_supports_mla
 
             if use_sparse:
-                logger.info_once("Using Sparse MLA backend on V1 engine.")
+                logger.info_once("Using Sparse MLA backend.")
                 return (
                     "vllm.v1.attention.backends.mla.flashmla_sparse."
                     "FlashMLASparseBackend"
@@ -313,15 +308,13 @@ class CudaPlatformBase(Platform):
             )
 
             if use_cutlassmla:
-                logger.info_once(
-                    "Using Cutlass MLA backend on V1 engine.", scope="local"
-                )
+                logger.info_once("Using Cutlass MLA backend.", scope="local")
                 return "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend"
             if use_flashinfermla:
                 from vllm.v1.attention.backends.utils import set_kv_cache_layout
 
                 set_kv_cache_layout("HND")
-                logger.info_once("Using FlashInfer MLA backend on V1 engine.")
+                logger.info_once("Using FlashInfer MLA backend.")
                 return (
                     "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend"
                 )
@@ -333,116 +326,107 @@ class CudaPlatformBase(Platform):
                         block_size,
                     )
                 else:
-                    logger.info_once("Using FlashMLA backend on V1 engine.")
+                    logger.info_once("Using FlashMLA backend.")
                     return "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend"
             if use_flashattn:
-                logger.info_once("Using FlashAttention MLA backend on V1 engine.")
+                logger.info_once("Using FlashAttention MLA backend.")
                 return (
                     "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend"
                 )
             if use_triton:
-                logger.info_once("Using Triton MLA backend on V1 engine.")
+                logger.info_once("Using Triton MLA backend.")
                 return "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend"
-        if use_v1:
-            FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend"  # noqa: E501
-            FLEX_ATTENTION_V1 = (
-                "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
-            )
-            TRITON_ATTN = (
-                "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"  # noqa: E501
-            )
-            FLASH_ATTN_V1 = (
-                "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"  # noqa: E501
-            )
-            TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend"  # noqa: E501
-            XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"  # noqa: E501
 
-            use_fp8_kv_cache = kv_cache_dtype is not None and kv_cache_dtype.startswith(
-                "fp8"
-            )
+        FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend"  # noqa: E501
+        FLEX_ATTENTION_V1 = (
+            "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
+        )
+        TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"  # noqa: E501
+        FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"  # noqa: E501
+        TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend"  # noqa: E501
+        XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"  # noqa: E501
 
-            if selected_backend == _Backend.FLASHINFER:
-                logger.info_once("Using FlashInfer backend on V1 engine.")
-                if cls.has_device_capability(100):
-                    from vllm.v1.attention.backends.utils import set_kv_cache_layout
+        use_fp8_kv_cache = kv_cache_dtype is not None and kv_cache_dtype.startswith(
+            "fp8"
+        )
+
+        if selected_backend == _Backend.FLASHINFER:
+            logger.info_once("Using FlashInfer backend.")
+            if cls.has_device_capability(100):
+                from vllm.v1.attention.backends.utils import set_kv_cache_layout
+
+                set_kv_cache_layout("HND")
+            return FLASHINFER_V1
+        elif selected_backend == _Backend.FLEX_ATTENTION:
+            logger.info_once("Using FlexAttention backend.")
+            return FLEX_ATTENTION_V1
+        elif selected_backend == _Backend.TRITON_ATTN:
+            logger.info_once("Using Triton backend.")
+            return TRITON_ATTN
+        elif selected_backend == _Backend.FLASH_ATTN:
+            logger.info_once("Using Flash Attention backend.")
+            return FLASH_ATTN_V1
+        elif selected_backend == _Backend.TREE_ATTN:
+            logger.info_once("Using Tree Attention backend.")
+            return TREE_ATTN_V1
+        elif selected_backend == _Backend.XFORMERS:
+            logger.info_once("Using XFormers backend.")
+            return XFORMERS_V1
+
+        from vllm.attention.selector import is_attn_backend_supported
+
+        # Default backends for V1 engine
+        # Prefer FlashInfer for Blackwell GPUs if installed
+        if cls.is_device_capability(100):
+            if is_default_backend_supported := is_attn_backend_supported(
+                FLASHINFER_V1, head_size, dtype
+            ):
+                from vllm.v1.attention.backends.utils import set_kv_cache_layout
+
+                logger.info_once(
+                    "Using FlashInfer backend with HND KV cache layout on "
+                    "V1 engine by default for Blackwell (SM 10.0) GPUs."
+                )
+                set_kv_cache_layout("HND")
 
-                    set_kv_cache_layout("HND")
                 return FLASHINFER_V1
-            elif selected_backend == _Backend.FLEX_ATTENTION:
-                logger.info_once("Using FlexAttention backend on V1 engine.")
-                return FLEX_ATTENTION_V1
-            elif selected_backend == _Backend.TRITON_ATTN:
-                logger.info_once("Using Triton backend on V1 engine.")
+
+            if not is_default_backend_supported.can_import:
+                logger.warning_once(
+                    "FlashInfer failed to import on Blackwell (SM 10.0) GPUs; "
+                    "it is recommended to install FlashInfer for better "
+                    "performance."
+                )
+
+        # FlashAttention is the default for SM 8.0+ GPUs
+        if cls.has_device_capability(80):
+            if (has_sink or use_fp8_kv_cache) and not cls.is_device_capability(90):
+                logger.info_once("Using Triton backend.")
                 return TRITON_ATTN
-            elif selected_backend == _Backend.FLASH_ATTN:
-                logger.info_once("Using Flash Attention backend on V1 engine.")
+            elif is_default_backend_supported := is_attn_backend_supported(
+                FLASH_ATTN_V1, head_size, dtype, allow_import_error=False
+            ):
+                logger.info_once("Using Flash Attention backend.")
                 return FLASH_ATTN_V1
-            elif selected_backend == _Backend.TREE_ATTN:
-                logger.info_once("Using Tree Attention backend on V1 engine.")
-                return TREE_ATTN_V1
-            elif selected_backend == _Backend.XFORMERS:
-                logger.info_once("Using XFormers backend on V1 engine.")
-                return XFORMERS_V1
 
-            from vllm.attention.selector import is_attn_backend_supported
-
-            # Default backends for V1 engine
-            # Prefer FlashInfer for Blackwell GPUs if installed
-            if cls.is_device_capability(100):
-                if is_default_backend_supported := is_attn_backend_supported(
-                    FLASHINFER_V1, head_size, dtype
-                ):
-                    from vllm.v1.attention.backends.utils import set_kv_cache_layout
-
-                    logger.info_once(
-                        "Using FlashInfer backend with HND KV cache layout on "
-                        "V1 engine by default for Blackwell (SM 10.0) GPUs."
-                    )
-                    set_kv_cache_layout("HND")
-
-                    return FLASHINFER_V1
-
-                if not is_default_backend_supported.can_import:
-                    logger.warning_once(
-                        "FlashInfer failed to import for V1 engine on "
-                        "Blackwell (SM 10.0) GPUs; it is recommended to "
-                        "install FlashInfer for better performance."
-                    )
-
-            # FlashAttention is the default for SM 8.0+ GPUs
-            if cls.has_device_capability(80):
-                if (has_sink or use_fp8_kv_cache) and not cls.is_device_capability(90):
-                    logger.info_once("Using Triton backend on V1 engine.")
-                    return TRITON_ATTN
-                elif is_default_backend_supported := is_attn_backend_supported(
-                    FLASH_ATTN_V1, head_size, dtype, allow_import_error=False
-                ):
-                    logger.info_once("Using Flash Attention backend on V1 engine.")
-                    return FLASH_ATTN_V1
-
-            # FlexAttention is the default for older GPUs
-            else:
-                logger.info_once("Using FlexAttention backend on V1 engine.")
-                return FLEX_ATTENTION_V1
-
-            assert not is_default_backend_supported
-
-            use_flex_attention_reason = {}
-            if not is_default_backend_supported.head_size:
-                use_flex_attention_reason["head_size"] = head_size
-            if not is_default_backend_supported.dtype:
-                use_flex_attention_reason["dtype"] = dtype
-
-            logger.info_once(
-                "Using FlexAttention backend for %s on V1 engine.",
-                ", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()),
-            )
+        # FlexAttention is the default for older GPUs
+        else:
+            logger.info_once("Using FlexAttention backend.")
             return FLEX_ATTENTION_V1
 
-        raise RuntimeError(
-            "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
-            "to select a supported backend."
+        assert not is_default_backend_supported
+
+        use_flex_attention_reason = {}
+        if not is_default_backend_supported.head_size:
+            use_flex_attention_reason["head_size"] = head_size
+        if not is_default_backend_supported.dtype:
+            use_flex_attention_reason["dtype"] = dtype
+
+        logger.info_once(
+            "Using FlexAttention backend for %s.",
+            ", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()),
         )
+        return FLEX_ATTENTION_V1
 
     @classmethod
     def get_punica_wrapper(cls) -> str:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 4462829564391..15e3b3a22bdee 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -467,14 +467,7 @@ class Platform:
         """
         Whether to use allgather in LogitsProcessor to gather the logits.
         """
-        import vllm.envs as envs
-        from vllm.config import get_current_vllm_config
-
-        parallel_config = get_current_vllm_config().parallel_config
-        return (
-            envs.VLLM_USE_V1
-            or parallel_config.distributed_executor_backend == "external_launcher"
-        )
+        return True
 
     @classmethod
     def use_custom_allreduce(cls) -> bool:
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index d3535c9781c48..e6536a02a73dd 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -77,6 +77,7 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
     "0x74b9": "AMD_Instinct_MI325X",  # MI325X VF
     "0x74a9": "AMD_Instinct_MI300X_HF",
     "0x74bd": "AMD_Instinct_MI300X_HF",
+    "0x744c": "AMD_Radeon_RX7900XTX",
 }
 
 # Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`
@@ -141,6 +142,8 @@ def use_rocm_custom_paged_attention(
     alibi_slopes: torch.Tensor | None = None,
     sinks: torch.Tensor | None = None,
 ) -> bool:
+    from vllm._aiter_ops import rocm_aiter_ops
+
     GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
     ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])
     ON_GFX11_GFX12 = any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"])
@@ -149,25 +152,21 @@ def use_rocm_custom_paged_attention(
     # disabled due to observed numerical discrepancy.
     if ON_GFX9:
         return (
-            (not envs.VLLM_USE_V1 or sliding_window == 0 or sliding_window == (-1, -1))
+            (sliding_window == 0 or sliding_window == (-1, -1))
             and (qtype == torch.half or qtype == torch.bfloat16)
             and (head_size == 64 or head_size == 128)
             and (block_size == 16 or block_size == 32)
             and (gqa_ratio >= 1 and gqa_ratio <= 16)
             and max_seq_len <= 128 * 1024
             and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
-            and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN and envs.VLLM_ROCM_USE_AITER)
+            and not (rocm_aiter_ops.is_pa_attn_enabled())
             and sinks is None
         )
 
     else:
         return (
             ON_GFX11_GFX12
-            and (
-                not envs.VLLM_USE_V1
-                or sliding_window == 0
-                or sliding_window == (-1, -1)
-            )
+            and (sliding_window == 0 or sliding_window == (-1, -1))
             and (qtype == torch.half or qtype == torch.bfloat16)
             and head_size == 128
             and block_size == 16
@@ -205,12 +204,15 @@ class RocmPlatform(Platform):
     ]
 
     @classmethod
-    def get_vit_attn_backend(cls, head_size: int, dtype: torch.dtype) -> "_Backend":
+    def get_vit_attn_backend(cls, head_size: int, dtype: torch.dtype) -> _Backend:
         from importlib.util import find_spec
 
+        from vllm._aiter_ops import rocm_aiter_ops
         from vllm.attention.backends.registry import _Backend
 
-        if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9():
+        if rocm_aiter_ops.is_mha_enabled():
+            # Note: AITER FA is only supported for Qwen-VL models.
+            # TODO: Add support for other VL models in their model class.
             return _Backend.ROCM_AITER_FA
 
         if on_gfx9() and find_spec("flash_attn") is not None:
@@ -231,87 +233,70 @@ class RocmPlatform(Platform):
         has_sink,
         use_sparse,
     ) -> str:
+        from vllm._aiter_ops import rocm_aiter_ops
         from vllm.attention.backends.registry import _Backend
 
         if use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on ROCm.")
-        if use_mla:
-            if not use_v1:
-                raise RuntimeError(
-                    "MLA attention backends require the V1 engine. "
-                    "Set VLLM_USE_V1=1 to enable them."
-                )
 
-            from vllm.v1.attention.backends.mla.rocm_aiter_mla import (
-                is_aiter_mla_enabled,
+        if not use_v1:
+            raise RuntimeError(
+                "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
+                "to select a supported backend."
             )
 
+        if use_mla:
             if selected_backend is None:
                 selected_backend = (
                     _Backend.ROCM_AITER_MLA
-                    if is_aiter_mla_enabled() or block_size == 1
+                    if rocm_aiter_ops.is_mla_enabled() or block_size == 1
                     else _Backend.TRITON_MLA
                 )
 
             if selected_backend == _Backend.TRITON_MLA:
                 if block_size != 1:
-                    logger.info_once("Using Triton MLA backend on V1 engine.")
+                    logger.info_once("Using Triton MLA backend.")
                     return "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend"
                 raise ValueError(
                     f" The selected backend, {selected_backend.name},"
                     f"does not support block size {block_size}."
                 )
             if selected_backend == _Backend.ROCM_AITER_MLA:
-                if block_size == 1:
-                    logger.info("Using AITER MLA backend on V1 engine.")
-                    return (
-                        "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"  # noqa: E501
-                    )
-                raise ValueError(
-                    f" The selected backend, {selected_backend.name},"
-                    f"does not support block size {block_size}."
-                    "(currently only supports block size 1)"
-                )
+                logger.info("Using AITER MLA backend.")
+                return "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"  # noqa: E501
+
             raise ValueError(
                 f" The selected backend, {selected_backend.name},"
                 f"is not MLA type while requested for MLA backend."
             )
 
-        if envs.VLLM_USE_V1:
-            if selected_backend == _Backend.FLEX_ATTENTION:
-                logger.info("Using FlexAttention backend on V1 engine.")
-                return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"
-            if (
-                envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9()
-            ) or selected_backend == _Backend.ROCM_AITER_FA:
-                logger.info("Using Aiter Flash Attention backend on V1 engine.")
-                return (
-                    "vllm.v1.attention.backends."
-                    "rocm_aiter_fa.AiterFlashAttentionBackend"
-                )
-            if (
-                envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION
-            ) or selected_backend == _Backend.ROCM_AITER_UNIFIED_ATTN:
-                logger.info("Using Aiter Unified Attention backend on V1 engine.")
-                return (
-                    "vllm.v1.attention.backends."
-                    "rocm_aiter_unified_attn.RocmAiterUnifiedAttentionBackend"
-                )
-            if (
-                envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
-                or selected_backend == _Backend.ROCM_ATTN
-            ):
-                # rocm specific backend, with aiter and/or
-                #   triton prefix-prefill
-                logger.info("Using Rocm Attention backend on V1 engine.")
-                return "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend"
-            # default case, using triton unified attention
-            logger.info("Using Triton Attention backend on V1 engine.")
-            return "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"
-        raise RuntimeError(
-            "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
-            "to select a supported backend."
-        )
+        if selected_backend == _Backend.FLEX_ATTENTION:
+            logger.info("Using FlexAttention backend.")
+            return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"
+        if (
+            rocm_aiter_ops.is_mha_enabled()
+        ) or selected_backend == _Backend.ROCM_AITER_FA:
+            logger.info("Using Aiter Flash Attention backend.")
+            return "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
+        if (
+            rocm_aiter_ops.is_triton_unified_attn_enabled()
+        ) or selected_backend == _Backend.ROCM_AITER_UNIFIED_ATTN:
+            logger.info("Using Aiter Unified Attention backend.")
+            return (
+                "vllm.v1.attention.backends."
+                "rocm_aiter_unified_attn.RocmAiterUnifiedAttentionBackend"
+            )
+        if (
+            envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
+            or selected_backend == _Backend.ROCM_ATTN
+        ):
+            # rocm specific backend, with aiter and/or
+            #   triton prefix-prefill
+            logger.info("Using Rocm Attention backend.")
+            return "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend"
+        # default case, using triton unified attention
+        logger.info("Using Triton Attention backend.")
+        return "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"
 
     @classmethod
     def set_device(cls, device: torch.device) -> None:
@@ -372,7 +357,6 @@ class RocmPlatform(Platform):
         parallel_config = vllm_config.parallel_config
         is_eager_execution = compilation_config == CUDAGraphMode.NONE
 
-        use_v1 = envs.VLLM_USE_V1
         use_aiter_rms_norm = (
             envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_RMSNORM
         )
@@ -384,8 +368,7 @@ class RocmPlatform(Platform):
             parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
         #  Aiter rms norm perform best when CUDA Graph capture is enabled.
         if (
-            use_v1
-            and use_aiter_rms_norm
+            use_aiter_rms_norm
             and not is_eager_execution
             and "-rms_norm" not in compilation_config.custom_ops
         ):
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 0a14ee011f7f2..1a4b67a1762f3 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -204,10 +204,6 @@ class TpuPlatform(Platform):
     def get_device_communicator_cls(cls) -> str:
         return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator"  # noqa
 
-    @classmethod
-    def use_all_gather(cls) -> bool:
-        return True
-
     @classmethod
     def validate_request(
         cls,
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 07ab759e4baa6..e4ecd0c807dac 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -66,16 +66,13 @@ class XPUPlatform(Platform):
 
         if use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on XPU.")
-        use_v1 = envs.VLLM_USE_V1
-        if not use_v1:
-            raise ValueError("XPU backend only supports V1.")
         TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"  # noqa: E501
         FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"  # noqa: E501
         if selected_backend == _Backend.TRITON_ATTN:
-            logger.info_once("Using Triton backend on V1 engine.")
+            logger.info_once("Using Triton backend.")
             return TRITON_ATTN
         elif selected_backend == _Backend.FLASH_ATTN:
-            logger.info_once("Using Flash Attention backend on V1 engine.")
+            logger.info_once("Using Flash Attention backend.")
             return FLASH_ATTN
         elif selected_backend:
             raise ValueError(
@@ -83,7 +80,7 @@ class XPUPlatform(Platform):
                 f"with use_v1: {use_v1} use_mla: {use_mla}"
             )
 
-        logger.info("Using Flash Attention backend on V1 engine.")
+        logger.info("Using Flash Attention backend.")
         return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
 
     @classmethod
diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/gpu_profiler.py
new file mode 100644
index 0000000000000..58c6689531615
--- /dev/null
+++ b/vllm/profiler/gpu_profiler.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class CudaProfilerWrapper:
+    def __init__(self) -> None:
+        self._profiler_running = False
+        # Note: lazy import to avoid dependency issues if CUDA is not available.
+        import torch.cuda.profiler as cuda_profiler
+
+        self._cuda_profiler = cuda_profiler
+
+    def start(self) -> None:
+        try:
+            self._cuda_profiler.start()
+            self._profiler_running = True
+            logger.info_once("Started CUDA profiler")
+        except Exception as e:
+            logger.warning_once("Failed to start CUDA profiler: %s", e)
+
+    def stop(self) -> None:
+        if self._profiler_running:
+            try:
+                self._cuda_profiler.stop()
+                logger.info_once("Stopped CUDA profiler")
+            except Exception as e:
+                logger.warning_once("Failed to stop CUDA profiler: %s", e)
+            finally:
+                self._profiler_running = False
+
+    def shutdown(self) -> None:
+        """Ensure profiler is stopped when shutting down."""
+        self.stop()
diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py
index 3d666882efb59..36e58dba6b497 100644
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -1,39 +1,92 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
-from .basic_parsers import BaseThinkingReasoningParser
-from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
-from .deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
-from .ernie45_reasoning_parser import Ernie45ReasoningParser
-from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser
-from .gptoss_reasoning_parser import GptOssReasoningParser
-from .granite_reasoning_parser import GraniteReasoningParser
-from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser
-from .identity_reasoning_parser import IdentityReasoningParser
-from .minimax_m2_reasoning_parser import MiniMaxM2ReasoningParser
-from .mistral_reasoning_parser import MistralReasoningParser
-from .olmo3_reasoning_parser import Olmo3ReasoningParser
-from .qwen3_reasoning_parser import Qwen3ReasoningParser
-from .seedoss_reasoning_parser import SeedOSSReasoningParser
-from .step3_reasoning_parser import Step3ReasoningParser
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
 
 __all__ = [
     "ReasoningParser",
-    "BaseThinkingReasoningParser",
     "ReasoningParserManager",
-    "DeepSeekR1ReasoningParser",
-    "IdentityReasoningParser",
-    "DeepSeekV3ReasoningParser",
-    "Ernie45ReasoningParser",
-    "GraniteReasoningParser",
-    "HunyuanA13BReasoningParser",
-    "Qwen3ReasoningParser",
-    "Glm4MoeModelReasoningParser",
-    "MistralReasoningParser",
-    "Olmo3ReasoningParser",
-    "Step3ReasoningParser",
-    "GptOssReasoningParser",
-    "SeedOSSReasoningParser",
-    "MiniMaxM2ReasoningParser",
 ]
+"""
+Register a lazy module mapping.
+
+Example:
+    ReasoningParserManager.register_lazy_module(
+        name="qwen3",
+        module_path="vllm.reasoning.qwen3_reasoning_parser",
+        class_name="Qwen3ReasoningParser",
+    )
+"""
+
+
+_REASONING_PARSERS_TO_REGISTER = {
+    "deepseek_r1": (  # name
+        "deepseek_r1_reasoning_parser",  # filename
+        "DeepSeekR1ReasoningParser",  # class_name
+    ),
+    "deepseek_v3": (
+        "deepseek_v3_reasoning_parser",
+        "DeepSeekV3ReasoningParser",
+    ),
+    "ernie45": (
+        "ernie45_reasoning_parser",
+        "Ernie45ReasoningParser",
+    ),
+    "glm45": (
+        "glm4_moe_reasoning_parser",
+        "Glm4MoeModelReasoningParser",
+    ),
+    "openai_gptoss": (
+        "gptoss_reasoning_parser",
+        "GptOssReasoningParser",
+    ),
+    "granite": (
+        "granite_reasoning_parser",
+        "GraniteReasoningParser",
+    ),
+    "hunyuan_a13b": (
+        "hunyuan_a13b_reasoning_parser",
+        "HunyuanA13BReasoningParser",
+    ),
+    "kimi_k2": (
+        "deepseek_r1_reasoning_parser",
+        "DeepSeekR1ReasoningParser",
+    ),
+    "minimax_m2": (
+        "minimax_m2_reasoning_parser",
+        "MiniMaxM2ReasoningParser",
+    ),
+    "minimax_m2_append_think": (
+        "minimax_m2_reasoning_parser",
+        "MiniMaxM2AppendThinkReasoningParser",
+    ),
+    "mistral": (
+        "mistral_reasoning_parser",
+        "MistralReasoningParser",
+    ),
+    "olmo3": (
+        "olmo3_reasoning_parser",
+        "Olmo3ReasoningParser",
+    ),
+    "qwen3": (
+        "qwen3_reasoning_parser",
+        "Qwen3ReasoningParser",
+    ),
+    "seed_oss": (
+        "seedoss_reasoning_parser",
+        "SeedOSSReasoningParser",
+    ),
+    "step3": (
+        "step3_reasoning_parser",
+        "Step3ReasoningParser",
+    ),
+}
+
+
+def register_lazy_reasoning_parsers():
+    for name, (file_name, class_name) in _REASONING_PARSERS_TO_REGISTER.items():
+        module_path = f"vllm.reasoning.{file_name}"
+        ReasoningParserManager.register_lazy_module(name, module_path, class_name)
+
+
+register_lazy_reasoning_parsers()
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index ebd660ca5a84d..d26e4ffc9c163 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib
 import os
 from abc import abstractmethod
 from collections.abc import Callable, Sequence
@@ -75,7 +76,7 @@ class ReasoningParser:
         """
 
     @abstractmethod
-    def extract_reasoning_content(
+    def extract_reasoning(
         self,
         model_output: str,
         request: ChatCompletionRequest | ResponsesRequest,
@@ -99,7 +100,7 @@ class ReasoningParser:
         """
 
     @abstractmethod
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -129,50 +130,117 @@ class ReasoningParser:
 
 
 class ReasoningParserManager:
-    reasoning_parsers: dict[str, type] = {}
+    """
+    Central registry for ReasoningParser implementations.
+
+    Supports two registration modes:
+      - Eager registration via `register_module`
+      - Lazy registration via `register_lazy_module`
+
+    Each reasoning parser must inherit from `ReasoningParser`.
+    """
+
+    reasoning_parsers: dict[str, type[ReasoningParser]] = {}
+    lazy_parsers: dict[str, tuple[str, str]] = {}  # name -> (module_path, class_name)
 
     @classmethod
-    def get_reasoning_parser(cls, name: str | None) -> type[ReasoningParser]:
+    def get_reasoning_parser(cls, name: str) -> type[ReasoningParser]:
         """
-        Get reasoning parser by name which is registered by `register_module`.
+        Retrieve a registered or lazily registered ReasoningParser class.
 
-        Raise a KeyError exception if the name is not registered.
+        If the parser is lazily registered, it will be imported and cached
+        on first access.
+
+        Raises:
+            KeyError: if no parser is found under the given name.
         """
         if name in cls.reasoning_parsers:
             return cls.reasoning_parsers[name]
 
-        raise KeyError(f"reasoning helper: '{name}' not found in reasoning_parsers")
+        if name in cls.lazy_parsers:
+            return cls._load_lazy_parser(name)
+
+        raise KeyError(f"Reasoning parser '{name}' not found.")
+
+    @classmethod
+    def list_registered(cls) -> list[str]:
+        """Return names of all eagerly and lazily registered reasoning parsers."""
+        return sorted(set(cls.reasoning_parsers.keys()) | set(cls.lazy_parsers.keys()))
+
+    @classmethod
+    def _load_lazy_parser(cls, name: str) -> type[ReasoningParser]:
+        """Import and register a lazily loaded reasoning parser."""
+        module_path, class_name = cls.lazy_parsers[name]
+        try:
+            mod = importlib.import_module(module_path)
+            parser_cls = getattr(mod, class_name)
+            if not issubclass(parser_cls, ReasoningParser):
+                raise TypeError(
+                    f"{class_name} in {module_path} is not a ReasoningParser subclass."
+                )
+
+            cls.reasoning_parsers[name] = parser_cls  # cache
+            return parser_cls
+        except Exception as e:
+            logger.exception(
+                "Failed to import lazy reasoning parser '%s' from %s: %s",
+                name,
+                module_path,
+                e,
+            )
+            raise
 
     @classmethod
     def _register_module(
         cls,
-        module: type,
+        module: type[ReasoningParser],
         module_name: str | list[str] | None = None,
         force: bool = True,
     ) -> None:
+        """Register a ReasoningParser class immediately."""
         if not issubclass(module, ReasoningParser):
             raise TypeError(
                 f"module must be subclass of ReasoningParser, but got {type(module)}"
             )
+
         if module_name is None:
-            module_name = module.__name__
-        if isinstance(module_name, str):
-            module_name = [module_name]
-        for name in module_name:
+            module_names = [module.__name__]
+        elif isinstance(module_name, str):
+            module_names = [module_name]
+        elif is_list_of(module_name, str):
+            module_names = module_name
+        else:
+            raise TypeError("module_name must be str, list[str], or None.")
+
+        for name in module_names:
             if not force and name in cls.reasoning_parsers:
-                existed_module = cls.reasoning_parsers[name]
-                raise KeyError(
-                    f"{name} is already registered at {existed_module.__module__}"
-                )
+                existed = cls.reasoning_parsers[name]
+                raise KeyError(f"{name} is already registered at {existed.__module__}")
             cls.reasoning_parsers[name] = module
 
+    @classmethod
+    def register_lazy_module(cls, name: str, module_path: str, class_name: str) -> None:
+        """
+        Register a lazy module mapping for delayed import.
+
+        Example:
+            ReasoningParserManager.register_lazy_module(
+                name="qwen3",
+                module_path="vllm.reasoning.parsers.qwen3_reasoning_parser",
+                class_name="Qwen3ReasoningParser",
+            )
+        """
+        cls.lazy_parsers[name] = (module_path, class_name)
+
     @classmethod
     def register_module(
         cls,
         name: str | list[str] | None = None,
         force: bool = True,
-        module: type | None = None,
-    ) -> type | Callable:
+        module: type[ReasoningParser] | None = None,
+    ) -> (
+        type[ReasoningParser] | Callable[[type[ReasoningParser]], type[ReasoningParser]]
+    ):
         """
         Register module with the given name or name list. it can be used as a
         decoder(with module as None) or normal function(with module as not
@@ -181,24 +249,29 @@ class ReasoningParserManager:
         if not isinstance(force, bool):
             raise TypeError(f"force must be a boolean, but got {type(force)}")
 
-        # raise the error ahead of time
-        if not (name is None or isinstance(name, str) or is_list_of(name, str)):
-            raise TypeError(
-                "name must be None, an instance of str, or a sequence of str, "
-                f"but got {type(name)}"
-            )
-
-        # use it as a normal method: x.register_module(module=SomeClass)
+        # Immediate registration (explicit call)
         if module is not None:
             cls._register_module(module=module, module_name=name, force=force)
             return module
 
-        # use it as a decorator: @x.register_module()
-        def _register(module):
-            cls._register_module(module=module, module_name=name, force=force)
-            return module
+        # Decorator usage
+        def _decorator(obj: type[ReasoningParser]) -> type[ReasoningParser]:
+            module_path = obj.__module__
+            class_name = obj.__name__
 
-        return _register
+            if isinstance(name, str):
+                names = [name]
+            elif is_list_of(name, str):
+                names = name
+            else:
+                names = [class_name]
+
+            for n in names:
+                cls.lazy_parsers[n] = (module_path, class_name)
+
+            return obj
+
+        return _decorator
 
     @classmethod
     def import_reasoning_parser(cls, plugin_path: str) -> None:
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index 621a73b2a59f0..0268947732726 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -3,15 +3,21 @@
 
 from abc import abstractmethod
 from collections.abc import Sequence
+from typing import TYPE_CHECKING, Any
 
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
-    DeltaMessage,
-    ResponsesRequest,
-)
+from vllm.entrypoints.openai.protocol import DeltaMessage
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.protocol import (
+        ChatCompletionRequest,
+        ResponsesRequest,
+    )
+else:
+    ChatCompletionRequest = Any
+    ResponsesRequest = Any
+
 
 class BaseThinkingReasoningParser(ReasoningParser):
     """
@@ -70,7 +76,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
         else:
             return input_ids[input_ids.index(self.end_token_id) + 1 :]
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -97,11 +103,10 @@ class BaseThinkingReasoningParser(ReasoningParser):
                 # start token in previous, end token in delta,
                 # extract reasoning content
                 end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[:end_index]
+                reasoning = delta_text[:end_index]
                 content = delta_text[end_index + len(self.end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
-                    content=content if content else None,
+                    reasoning=reasoning, content=content if content else None
                 )
             elif self.end_token_id in previous_token_ids:
                 # start token in previous, end token in previous,
@@ -110,30 +115,27 @@ class BaseThinkingReasoningParser(ReasoningParser):
             else:
                 # start token in previous, no end token in previous or delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         elif self.start_token_id in delta_token_ids:
             if self.end_token_id in delta_token_ids:
                 # start token in delta, end token in delta,
                 # extract reasoning content
                 start_index = delta_text.find(self.start_token)
                 end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[
-                    start_index + len(self.start_token) : end_index
-                ]
+                reasoning = delta_text[start_index + len(self.start_token) : end_index]
                 content = delta_text[end_index + len(self.end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
-                    content=content if content else None,
+                    reasoning=reasoning, content=content if content else None
                 )
             else:
                 # start token in delta, no end token in delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         else:
             # not find thinking start token
             return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
     ) -> tuple[str | None, str | None]:
         """
@@ -154,7 +156,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
         if self.end_token not in model_output:
             return model_output, None
         else:
-            reasoning_content, _, content = model_output.partition(self.end_token)
+            reasoning, _, content = model_output.partition(self.end_token)
             # If generation stops right after end-of-think, return null content
             final_content = content or None
-            return reasoning_content, final_content
+            return reasoning, final_content
diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py
index d5200145ea03e..a91c8ceeb6255 100644
--- a/vllm/reasoning/deepseek_r1_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py
@@ -4,11 +4,9 @@
 from collections.abc import Sequence
 
 from vllm.entrypoints.openai.protocol import DeltaMessage
-from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
 
-@ReasoningParserManager.register_module("deepseek_r1")
 class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
     """
     Reasoning parser for DeepSeek R1 model.
@@ -27,7 +25,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
         """The token that ends reasoning content."""
         return "</think>"
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -36,7 +34,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
     ) -> DeltaMessage | None:
-        ret = super().extract_reasoning_content_streaming(
+        ret = super().extract_reasoning_streaming(
             previous_text,
             current_text,
             delta_text,
@@ -53,10 +51,10 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
                 # end token in delta with more tokens,
                 # extract reasoning content and content
                 end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[:end_index]
+                reasoning = delta_text[:end_index]
                 content = delta_text[end_index + len(self.end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                     content=content if content else None,
                 )
             elif self.end_token_id in previous_token_ids:
@@ -64,6 +62,6 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
                 return DeltaMessage(content=delta_text)
             else:
                 # no end token in previous or delta, reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
 
         return ret
diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
index 7116f90a1ac0a..afdf73262aca0 100644
--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -7,18 +7,14 @@ from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.logger import init_logger
-from vllm.reasoning import (
-    DeepSeekR1ReasoningParser,
-    ReasoningParser,
-    ReasoningParserManager,
-)
+from vllm.reasoning import ReasoningParser
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
 
 from .identity_reasoning_parser import IdentityReasoningParser
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("deepseek_v3")
 class DeepSeekV3ReasoningParser(ReasoningParser):
     """
     V3 parser that delegates to either DeepSeekR1ReasoningParser or
@@ -42,12 +38,12 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         return self._parser.extract_content_ids(input_ids)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
-        return self._parser.extract_reasoning_content(model_output, request)
+        return self._parser.extract_reasoning(model_output, request)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -56,7 +52,7 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
     ) -> DeltaMessage | None:
-        return self._parser.extract_reasoning_content_streaming(
+        return self._parser.extract_reasoning_streaming(
             previous_text,
             current_text,
             delta_text,
diff --git a/vllm/reasoning/ernie45_reasoning_parser.py b/vllm/reasoning/ernie45_reasoning_parser.py
index f9d4a30398cfd..3cdbf14858ec2 100644
--- a/vllm/reasoning/ernie45_reasoning_parser.py
+++ b/vllm/reasoning/ernie45_reasoning_parser.py
@@ -7,13 +7,11 @@ from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParserManager
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("ernie45")
 class Ernie45ReasoningParser(BaseThinkingReasoningParser):
     """
     Reasoning parser for Ernie45 thinking model.
@@ -36,8 +34,8 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
         """The token that ends reasoning content."""
         return "</think>"
 
-    def __init__(self, tokenizer: PreTrainedTokenizerBase):
-        super().__init__(tokenizer)
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
 
         if not self.model_tokenizer:
             raise ValueError(
@@ -59,7 +57,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
                 "tokens in the tokenizer!"
             )
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -75,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
         The Ernie45 thinking model ouput format is
             abc\n</think>\n\n<response>\ndef\n</response>\n
         or  abc\n</think>\ndef
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'def' goes to content
         """
         # Skip single special tokens
@@ -96,7 +94,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
             # </think> in delta with more tokens,
             # extract reasoning content and content
             think_end_index = delta_text.find(self.end_token)
-            reasoning_content = delta_text[:think_end_index]
+            reasoning = delta_text[:think_end_index]
             content = delta_text[think_end_index + len(self.end_token) :]
             content = content.lstrip("\n")
             response_start_idx = content.find(self.response_start_token)
@@ -106,7 +104,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
             if response_end_idx != -1:
                 content = content[:response_end_idx]
             return DeltaMessage(
-                reasoning_content=reasoning_content,
+                reasoning=reasoning,
                 content=content if content else None,
             )
         elif self.end_token_id in previous_token_ids:
@@ -140,9 +138,9 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
             return DeltaMessage(content=content if content else None)
         else:
             # no </think> in previous or delta, reasoning content continues
-            return DeltaMessage(reasoning_content=delta_text)
+            return DeltaMessage(reasoning=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
         """
@@ -150,14 +148,12 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
         The Ernie45 thinking model ouput format is
             abc\n</think>\n\n\n<response>\ndef\n</response>\n
         or  abc\n</think>\ndef
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'def' goes to content
         Returns:
             tuple[Optional[str], Optional[str]]: reasoning content and content
         """
-        reasoning_content, content = super().extract_reasoning_content(
-            model_output, request
-        )
+        reasoning, content = super().extract_reasoning(model_output, request)
         if content:
             start_idx = content.find(self.response_start_token)
             end_idx = content.rfind(self.response_end_token)
@@ -166,4 +162,4 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
                 content = content[start_idx + len(self.response_start_token) : end_idx]
         final_content = content or None
 
-        return reasoning_content, final_content
+        return reasoning, final_content
diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py
index 09cd43c1d555e..1871adcd43210 100644
--- a/vllm/reasoning/glm4_moe_reasoning_parser.py
+++ b/vllm/reasoning/glm4_moe_reasoning_parser.py
@@ -7,12 +7,11 @@ from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("glm45")
 class Glm4MoeModelReasoningParser(ReasoningParser):
     """
     Reasoning parser for the Glm4MoeModel model.
@@ -71,7 +70,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
         else:
             return input_ids[input_ids.index(self.think_end_token_id) + 1 :]
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -85,7 +84,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
         Handles streaming output where previous + delta = current.
         Uses token IDs for faster processing.
         For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
         """
         # Skip single special tokens
@@ -99,10 +98,10 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
                 # <think> in previous, </think> in delta,
                 # extract reasoning content
                 end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[:end_index]
+                reasoning = delta_text[:end_index]
                 content = delta_text[end_index + len(self.think_end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                     content=content if content else None,
                 )
             elif self.think_end_token_id in previous_token_ids:
@@ -112,36 +111,36 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
             else:
                 # <think> in previous, no </think> in previous or delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         elif self.think_start_token_id in delta_token_ids:
             if self.think_end_token_id in delta_token_ids:
                 # <think> in delta, </think> in delta, extract reasoning content
                 start_index = delta_text.find(self.think_start_token)
                 end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[
+                reasoning = delta_text[
                     start_index + len(self.think_start_token) : end_index
                 ]
                 content = delta_text[end_index + len(self.think_end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                     content=content if content else None,
                 )
             else:
                 # <think> in delta, no </think> in delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         else:
             # thinking is disabled, just content
             return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
 
         For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
 
         Returns:
@@ -166,7 +165,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
             return None, model_output
 
         # Extract reasoning content from the model output.
-        reasoning_content, _, content = model_output.partition(self.think_end_token)
+        reasoning, _, content = model_output.partition(self.think_end_token)
 
         final_content = content or None
-        return reasoning_content, final_content
+        return reasoning, final_content
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index e6766ddcbc687..0c1b54d0bd359 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -9,7 +9,7 @@ from vllm.entrypoints.harmony_utils import parse_chat_output
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
@@ -57,7 +57,6 @@ def tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tool_list: list[str])
     return new_tag
 
 
-@ReasoningParserManager.register_module("openai_gptoss")
 class GptOssReasoningParser(ReasoningParser):
     """
     Reasoning parser for GptOss model.
@@ -68,18 +67,35 @@ class GptOssReasoningParser(ReasoningParser):
 
     def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
         super().__init__(tokenizer, *args, **kwargs)
-        self.reasoning_end_token_ids = self.model_tokenizer.encode(
-            "<|start|>assistant<|channel|>final<|message|>"
+        # The model can output some special tokens between "final" and "<|message|>"
+        # So we need to look for both sequences to determine the end of reasoning.
+        self.reasoning_end_token_ids_prefix = self.model_tokenizer.encode(
+            "<|channel|>final"
         )
+        self.reasoning_end_token_ids_suffix = self.model_tokenizer.encode("<|message|>")
+        self.reasoning_max_num_between_tokens = 20
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
-        end_token_ids = self.reasoning_end_token_ids
-        assert len(end_token_ids) > 0, "reasoning_end_token_ids is empty"
+        end_token_ids_prefix = self.reasoning_end_token_ids_prefix
+        end_token_ids_suffix = self.reasoning_end_token_ids_suffix
+        assert len(end_token_ids_prefix) > 0, "reasoning_end_token_ids_prefix is empty"
+        assert len(end_token_ids_suffix) > 0, "reasoning_end_token_ids_suffix is empty"
         # Check if the end sequence is present in the input_ids.
         # We search from the end of input_ids to find the last match.
-        for i in range(len(input_ids) - len(end_token_ids), -1, -1):
-            if input_ids[i : i + len(end_token_ids)] == end_token_ids:
-                return True
+        for i in range(len(input_ids) - len(end_token_ids_prefix), -1, -1):
+            if input_ids[i : i + len(end_token_ids_prefix)] == end_token_ids_prefix:
+                # We have found the prefix, now we look for the suffix after the prefix.
+                suffix_start = i + len(end_token_ids_prefix)
+                for j in range(
+                    suffix_start, len(input_ids) - len(end_token_ids_suffix) + 1
+                ):
+                    if j - suffix_start >= self.reasoning_max_num_between_tokens:
+                        break
+                    if (
+                        input_ids[j : j + len(end_token_ids_suffix)]
+                        == end_token_ids_suffix
+                    ):
+                        return True
         return False
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
@@ -88,7 +104,7 @@ class GptOssReasoningParser(ReasoningParser):
             return []
         return self.model_tokenizer.encode(content)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -115,9 +131,9 @@ class GptOssReasoningParser(ReasoningParser):
                 content_delta = cur_content
         if reasoning_delta is None and content_delta is None:
             return None
-        return DeltaMessage(reasoning_content=reasoning_delta, content=content_delta)
+        return DeltaMessage(reasoning=reasoning_delta, content=content_delta)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self,
         model_output: str,
         request: ChatCompletionRequest,
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index 44391f8ad6351..484045d66a3c9 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -8,12 +8,11 @@ from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("granite")
 class GraniteReasoningParser(ReasoningParser):
     """
     Reasoning parser for IBM Granite.
@@ -50,7 +49,7 @@ class GraniteReasoningParser(ReasoningParser):
             len(think_start) for think_start in self.valid_think_starts
         )
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
@@ -68,12 +67,12 @@ class GraniteReasoningParser(ReasoningParser):
         re_match = self.reasoning_regex.findall(model_output)
         if not re_match:
             return None, model_output
-        reasoning_content, response_content = re_match[0]
+        reasoning, response_content = re_match[0]
         if not response_content:
-            return reasoning_content, None
-        return reasoning_content, response_content
+            return reasoning, None
+        return reasoning, response_content
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -108,12 +107,10 @@ class GraniteReasoningParser(ReasoningParser):
             Union[DeltaMessage, None]
                 DeltaMessage with either reasoning content or content, or None.
         """
-        reasoning_content, resp_seq_len, content = self._get_content_sections(
-            current_text
-        )
+        reasoning, resp_seq_len, content = self._get_content_sections(current_text)
         # Either we haven't finished the start of the reasoning sequence,
         # or the model is generating something unexpected.
-        if not reasoning_content:
+        if not reasoning:
             delta_message = self._get_delta_message_with_no_reasoning_bounds(
                 current_text, delta_text
             )
@@ -121,16 +118,16 @@ class GraniteReasoningParser(ReasoningParser):
         # the start of response sequence.
         elif not content:
             delta_message = self._get_delta_message_with_no_response_bounds(
-                current_text, reasoning_content, delta_text
+                current_text, reasoning, delta_text
             )
         # We've finished both the start of reasoning and start of response seq.
         else:
             # This should never happen since we matched on the response
             assert resp_seq_len is not None
             delta_message = self._get_delta_message_with_both_bounds(
-                delta_text, reasoning_content, content, current_text, resp_seq_len
+                delta_text, reasoning, content, current_text, resp_seq_len
             )
-        if not delta_message.content and not delta_message.reasoning_content:
+        if not delta_message.content and not delta_message.reasoning:
             return None
         return delta_message
 
@@ -186,20 +183,20 @@ class GraniteReasoningParser(ReasoningParser):
         # message and append everything to content in the future.
         if was_substr and not is_substr:
             return DeltaMessage(
-                reasoning_content=None,
+                reasoning=None,
                 content=current_text,
             )
         if is_substr:
             # Might still be in the special token sequence; return nothing
-            return DeltaMessage(reasoning_content=None, content=None)
+            return DeltaMessage(reasoning=None, content=None)
         # Otherwise the sequence has already been broken and we already
         # corrected; just return the delta text as normal content.
-        return DeltaMessage(reasoning_content=None, content=delta_text)
+        return DeltaMessage(reasoning=None, content=delta_text)
 
     def _get_delta_message_with_no_response_bounds(
         self,
         current_text: str,
-        reasoning_content: str,
+        reasoning: str,
         delta_text: str,
     ) -> DeltaMessage:
         """Parse the delta message when the current text has both reasoning
@@ -209,7 +206,7 @@ class GraniteReasoningParser(ReasoningParser):
 
         Args:
             current_text (str): The full previous + delta text.
-            reasoning_content (str): reasoning content from current_text.
+            reasoning (str): reasoning content from current_text.
             delta_text (str): Text to consider and parse content from.
 
         Returns:
@@ -223,12 +220,12 @@ class GraniteReasoningParser(ReasoningParser):
             current_text.endswith(response_start)
             for response_start in self.valid_response_starts
         )
-        if reasoning_content is None or ends_with_start_response_seq:
-            return DeltaMessage(reasoning_content=None, content=None)
+        if reasoning is None or ends_with_start_response_seq:
+            return DeltaMessage(reasoning=None, content=None)
 
         # Consider previous / current text only within context of the reasoning
-        previous_text = reasoning_content[: -len(delta_text)]
-        current_text = reasoning_content
+        previous_text = reasoning[: -len(delta_text)]
+        current_text = reasoning
 
         # We need to be careful about adding unfinished response sequences;
         # Find the place at which we MIGHT be starting a response sequence
@@ -254,32 +251,30 @@ class GraniteReasoningParser(ReasoningParser):
 
         # Delta only contains potential continued response sequence text.
         if delta_continues_substr:
-            return DeltaMessage(reasoning_content=None, content=None)
+            return DeltaMessage(reasoning=None, content=None)
 
         if not prev_was_substr:
             # Delta may be starting a new response seq but has other text too.
             if delta_new_substr:
-                return DeltaMessage(
-                    reasoning_content=delta_text[:delta_idx], content=None
-                )
+                return DeltaMessage(reasoning=delta_text[:delta_idx], content=None)
             # Normal case for most reasoning text (no potential special seqs).
-            return DeltaMessage(reasoning_content=delta_text, content=None)
+            return DeltaMessage(reasoning=delta_text, content=None)
         # The substring that previously seemed to be a potential response
         # seq wasn't one; we need to add the content to the delta message,
         # and also slice off the potential response sequence
         elif delta_new_substr:
-            reasoning_content = previous_text[prev_idx:] + delta_text[:delta_idx]
-            return DeltaMessage(reasoning_content=reasoning_content, content=None)
+            reasoning = previous_text[prev_idx:] + delta_text[:delta_idx]
+            return DeltaMessage(reasoning=reasoning, content=None)
         # No new substring yet, and we broke our old one; take the whole delta
         return DeltaMessage(
-            reasoning_content=previous_text[prev_idx:] + delta_text,
+            reasoning=previous_text[prev_idx:] + delta_text,
             content=None,
         )
 
     def _get_delta_message_with_both_bounds(
         self,
         delta_text: str,
-        reasoning_content: str,
+        reasoning: str,
         response_content: str,
         current_text: str,
         response_seq_len: int,
@@ -289,7 +284,7 @@ class GraniteReasoningParser(ReasoningParser):
 
         Args:
             delta_text: Text to consider and parse content from.
-            reasoning_content: reasoning content from current_text.
+            reasoning: reasoning content from current_text.
             response_content: response content from current_text.
             current_text: The full previous + delta text.
             response_seq_len: Len of the complete response sequence used.
@@ -302,20 +297,20 @@ class GraniteReasoningParser(ReasoningParser):
         reasoning_end_idx = len(delta_text) - (len(response_content) + response_seq_len)
 
         if reasoning_end_idx < 0:
-            delta_reasoning_content = None
+            delta_reasoning = None
         else:
             # Get the starting offset
-            start_reasoning_content_idx = (
-                len(reasoning_content) + response_seq_len + len(response_content) - 1
+            start_reasoning_idx = (
+                len(reasoning) + response_seq_len + len(response_content) - 1
             )
             delta_offset = len(current_text) - len(delta_text)
-            start_offset = start_reasoning_content_idx - delta_offset
+            start_offset = start_reasoning_idx - delta_offset
             if start_offset < 0:
                 start_offset = 0
-            delta_reasoning_content = delta_text[start_offset:reasoning_end_idx]
+            delta_reasoning = delta_text[start_offset:reasoning_end_idx]
 
         return DeltaMessage(
-            reasoning_content=delta_reasoning_content,
+            reasoning=delta_reasoning,
             content=delta_content,
         )
 
@@ -334,7 +329,7 @@ class GraniteReasoningParser(ReasoningParser):
             (if there is one) and the non-reasoning content.
         """
         current_chunk_start = 0
-        start_reasoning_content = None
+        start_reasoning = None
         parsed_content = False
         delimiter_idxs = [
             idx
@@ -345,10 +340,10 @@ class GraniteReasoningParser(ReasoningParser):
         for current_chunk_end in delimiter_idxs:
             current_chunk = current_text[current_chunk_start:current_chunk_end]
             # Check to see if the start of reasoning seq if complete
-            if start_reasoning_content is None:
+            if start_reasoning is None:
                 for think_start in self.valid_think_starts:
                     if current_chunk == think_start[:-1]:
-                        start_reasoning_content = current_chunk_end + 1
+                        start_reasoning = current_chunk_end + 1
                         current_chunk_start = current_chunk_end + 1
                         break
 
@@ -358,13 +353,11 @@ class GraniteReasoningParser(ReasoningParser):
                     if current_chunk[-len(response_start) + 1 :] == response_start[:-1]:
                         # Mark end of reasoning and start response content
                         # after the start of response sequence.
-                        end_reasoning_content = current_chunk_end - len(response_start)
-                        reasoning_content = current_text[
-                            start_reasoning_content:end_reasoning_content
-                        ]
+                        end_reasoning = current_chunk_end - len(response_start)
+                        reasoning = current_text[start_reasoning:end_reasoning]
                         response_content = current_text[current_chunk_end + 1 :]
-                        return reasoning_content, len(response_start), response_content
+                        return reasoning, len(response_start), response_content
 
-        if start_reasoning_content and not parsed_content:
-            return current_text[start_reasoning_content:], None, None
+        if start_reasoning and not parsed_content:
+            return current_text[start_reasoning:], None, None
         return None, None, None
diff --git a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
index e5cf6f399740f..f297454f57ec9 100644
--- a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
+++ b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
@@ -8,12 +8,11 @@ from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("hunyuan_a13b")
 class HunyuanA13BReasoningParser(ReasoningParser):
     """
     Reasoning parser for Hunyuan A13B Model
@@ -87,7 +86,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
         # this id is not part of content, so just return [] here.
         return []
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
@@ -105,27 +104,27 @@ class HunyuanA13BReasoningParser(ReasoningParser):
 
         re_match = self.full_match_reasoning_regex.findall(model_output)
         if re_match:
-            reasoning_content, response_content = re_match[0]
-            if len(reasoning_content) == 0:
-                reasoning_content = None
+            reasoning, response_content = re_match[0]
+            if len(reasoning) == 0:
+                reasoning = None
             if len(response_content) == 0:
                 response_content = None
-            return reasoning_content, response_content
+            return reasoning, response_content
 
         fallback_regex = self.half_match_reasoning_regex
         fallback_match = fallback_regex.findall(model_output)
         if fallback_match:
-            reasoning_content, response_content = fallback_match[0]
+            reasoning, response_content = fallback_match[0]
 
             if response_content.endswith(self.response_end_expr):
                 response_content = response_content[: -len(self.response_end_expr)]
 
-            if len(reasoning_content) == 0:
-                reasoning_content = None
+            if len(reasoning) == 0:
+                reasoning = None
             if len(response_content) == 0:
                 response_content = None
 
-            return reasoning_content, response_content
+            return reasoning, response_content
 
         return None, model_output
 
@@ -141,7 +140,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
                 sub_idx += 1
         return sub_idx == len(subsequence)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -224,19 +223,15 @@ class HunyuanA13BReasoningParser(ReasoningParser):
 
                 # Return content based on current state
                 if self.current_state == "think":
-                    return DeltaMessage(
-                        reasoning_content=buffered_content, content=None
-                    )
+                    return DeltaMessage(reasoning=buffered_content, content=None)
                 else:
-                    return DeltaMessage(
-                        reasoning_content=None, content=buffered_content
-                    )
+                    return DeltaMessage(reasoning=None, content=buffered_content)
             else:
                 # No buffered content, send normally
                 if self.current_state == "think":
-                    return DeltaMessage(reasoning_content=delta_text, content=None)
+                    return DeltaMessage(reasoning=delta_text, content=None)
                 else:
-                    return DeltaMessage(reasoning_content=None, content=delta_text)
+                    return DeltaMessage(reasoning=None, content=delta_text)
 
         # If no content to send in this delta
         return None
diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py
index f1d17a71be338..e92f8add0391a 100644
--- a/vllm/reasoning/identity_reasoning_parser.py
+++ b/vllm/reasoning/identity_reasoning_parser.py
@@ -36,7 +36,7 @@ class IdentityReasoningParser(ReasoningParser):
         # Identity: return all tokens as content
         return input_ids
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -50,9 +50,9 @@ class IdentityReasoningParser(ReasoningParser):
             return DeltaMessage(content=delta_text)
         return None
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
-        # No reasoning separation: return None for reasoning_content,
+        # No reasoning separation: return None for reasoning,
         # and full model_output as content
         return None, model_output
diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py
index 0d4f6cc270a1c..30f5f2f88caf7 100644
--- a/vllm/reasoning/minimax_m2_reasoning_parser.py
+++ b/vllm/reasoning/minimax_m2_reasoning_parser.py
@@ -9,14 +9,13 @@ from vllm.entrypoints.openai.protocol import (
     ResponsesRequest,
 )
 from vllm.logger import init_logger
-from vllm.reasoning.abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("minimax_m2")
 class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
     """
     Reasoning parser for MiniMax M2 model.
@@ -33,7 +32,6 @@ class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
         return "</think>"
 
 
-@ReasoningParserManager.register_module("minimax_m2_append_think")
 class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
     """
     Reasoning parser for MiniMax M2 model.
@@ -50,7 +48,7 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         return input_ids
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -63,7 +61,7 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
             delta_text = "<think>" + delta_text
         return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
     ) -> tuple[str | None, str | None]:
         return None, "<think>" + model_output
diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py
index 5658c372a264c..af6d179bf6d01 100644
--- a/vllm/reasoning/mistral_reasoning_parser.py
+++ b/vllm/reasoning/mistral_reasoning_parser.py
@@ -4,14 +4,13 @@
 from functools import cached_property
 
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("mistral")
 class MistralReasoningParser(DeepSeekR1ReasoningParser):
     """
     Reasoning parser for Mistral models.
diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py
index b6c26899a1148..7149f8c4123b3 100644
--- a/vllm/reasoning/olmo3_reasoning_parser.py
+++ b/vllm/reasoning/olmo3_reasoning_parser.py
@@ -17,7 +17,7 @@ from vllm.entrypoints.openai.protocol import (
     ResponsesRequest,
 )
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
@@ -115,7 +115,7 @@ class Olmo3ReasoningBuffer:
             if end_think_idx > 0:
                 # this covers the case there's content before
                 # the end of the reasoning block
-                return DeltaMessage(reasoning_content=pretext)
+                return DeltaMessage(reasoning=pretext)
 
         if self.state == Olmo3ReasoningState.REASONING:
             # we are inside reasoning block, return and empty
@@ -124,7 +124,7 @@ class Olmo3ReasoningBuffer:
                 text_buffer,
                 self.buffer,
             ) = self.buffer, ""
-            return DeltaMessage(reasoning_content=text_buffer)
+            return DeltaMessage(reasoning=text_buffer)
 
         if self.state == Olmo3ReasoningState.CONTENT:
             # we are outside reasoning block, return and empty
@@ -192,7 +192,6 @@ class Olmo3ReasoningBuffer:
         return delta_message
 
 
-@ReasoningParserManager.register_module("olmo3")
 class Olmo3ReasoningParser(ReasoningParser):
     """
     Reasoning parser for Olmo 3 model
@@ -251,7 +250,7 @@ class Olmo3ReasoningParser(ReasoningParser):
         # this id is not part of content, so just return [] here.
         return []
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self,
         model_output: str,
         request: ChatCompletionRequest | ResponsesRequest,
@@ -272,14 +271,14 @@ class Olmo3ReasoningParser(ReasoningParser):
 
         re_match = self.reasoning_regex.match(model_output)
         if re_match:
-            reasoning_content = re_match.group("reasoning") or None
+            reasoning = re_match.group("reasoning") or None
             content = re_match.group("content") or None
-            return reasoning_content, content
+            return reasoning, content
 
         # no reasoning content
         return None, model_output
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index 2ec06720719da..ef7762bf0af59 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -3,11 +3,9 @@
 
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ResponsesRequest
-from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
 
-@ReasoningParserManager.register_module("qwen3")
 class Qwen3ReasoningParser(BaseThinkingReasoningParser):
     """
     Reasoning parser for the Qwen3 model.
@@ -29,7 +27,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
         """The token that ends reasoning content."""
         return "</think>"
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
     ) -> tuple[str | None, str | None]:
         """
@@ -39,7 +37,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
         to be present, unlike other models that work with just the end token.
 
         For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
 
         Returns:
@@ -63,7 +61,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
             return None, model_output
 
         # Extract reasoning content from the model output.
-        reasoning_content, _, content = model_output.partition(self.end_token)
+        reasoning, _, content = model_output.partition(self.end_token)
 
         final_content = content or None
-        return reasoning_content, final_content
+        return reasoning, final_content
diff --git a/vllm/reasoning/seedoss_reasoning_parser.py b/vllm/reasoning/seedoss_reasoning_parser.py
index 72f8dc54f1b37..d3d4d8ec0749e 100644
--- a/vllm/reasoning/seedoss_reasoning_parser.py
+++ b/vllm/reasoning/seedoss_reasoning_parser.py
@@ -1,11 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
+
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
 
-@ReasoningParserManager.register_module("seed_oss")
 class SeedOSSReasoningParser(BaseThinkingReasoningParser):
     """
     Reasoning parser for SeedOSS model.
diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py
index ae066d96f2505..f635758a92c0d 100644
--- a/vllm/reasoning/step3_reasoning_parser.py
+++ b/vllm/reasoning/step3_reasoning_parser.py
@@ -8,12 +8,11 @@ from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.logger import init_logger
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.reasoning import ReasoningParser
 
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("step3")
 class Step3ReasoningParser(ReasoningParser):
     """
     Reasoning parser for Step3 model.
@@ -41,7 +40,7 @@ class Step3ReasoningParser(ReasoningParser):
                 "token in the tokenizer!"
             )
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -55,7 +54,7 @@ class Step3ReasoningParser(ReasoningParser):
         Handles streaming output where previous + delta = current.
         Uses token IDs for faster processing.
         For text "abc</think>xyz":
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
         """
         # Skip single special token
@@ -65,10 +64,10 @@ class Step3ReasoningParser(ReasoningParser):
         if self.think_end_token_id in delta_token_ids:
             # </think> in delta, extract reasoning content and remaining content
             end_index = delta_text.find(self.think_end_token)
-            reasoning_content = delta_text[:end_index]
+            reasoning = delta_text[:end_index]
             content = delta_text[end_index + len(self.think_end_token) :]
             return DeltaMessage(
-                reasoning_content=reasoning_content,
+                reasoning=reasoning,
                 content=content if content else None,
             )
         elif self.think_end_token_id in previous_token_ids:
@@ -76,9 +75,9 @@ class Step3ReasoningParser(ReasoningParser):
             return DeltaMessage(content=delta_text)
         else:
             # No </think> seen yet, everything is reasoning
-            return DeltaMessage(reasoning_content=delta_text)
+            return DeltaMessage(reasoning=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
         # Check if the model output contains the </think> token
@@ -88,7 +87,7 @@ class Step3ReasoningParser(ReasoningParser):
         else:
             # Find the first occurrence of </think>
             end_index = model_output.find(self.think_end_token)
-            reasoning_content = model_output[:end_index]
+            reasoning = model_output[:end_index]
 
             # Content after </think> token
             content = model_output[end_index + len(self.think_end_token) :]
@@ -96,7 +95,7 @@ class Step3ReasoningParser(ReasoningParser):
             if len(content) == 0:
                 content = None
 
-            return reasoning_content, content
+            return reasoning, content
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids
diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py
index 3bdbe1d0a67b6..fe84b6c152eef 100644
--- a/vllm/transformers_utils/chat_templates/registry.py
+++ b/vllm/transformers_utils/chat_templates/registry.py
@@ -40,6 +40,7 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
     "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
     "qwen": _get_qwen_chat_template_fallback,
     "siglip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
+    "siglip2": CHAT_TEMPLATES_DIR / "template_basic.jinja",
 }
 
 
diff --git a/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja
index 661ebd1cf5c17..c73ae96f0c1d5 100644
--- a/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja
+++ b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja
@@ -30,18 +30,18 @@
         {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
     {%- elif message.role == "assistant" %}
         {%- set content = message.content %}
-        {%- set reasoning_content = '' %}
-        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
-            {%- set reasoning_content = message.reasoning_content %}
+        {%- set reasoning = '' %}
+        {%- if message.reasoning is defined and message.reasoning is not none %}
+            {%- set reasoning = message.reasoning %}
         {%- else %}
             {%- if '</think>' in message.content %}
                 {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
-                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set reasoning = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
             {%- endif %}
         {%- endif %}
         {%- if loop.index0 > ns.last_query_index %}
-            {%- if loop.last or (not loop.last and reasoning_content) %}
-                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- if loop.last or (not loop.last and reasoning) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
             {%- else %}
                 {{- '<|im_start|>' + message.role + '\n' + content }}
             {%- endif %}
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index b1f4e3e2a9831..14cae2b168e19 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -24,7 +24,7 @@ from huggingface_hub.utils import (
     RepositoryNotFoundError,
     RevisionNotFoundError,
 )
-from transformers import GenerationConfig, PretrainedConfig
+from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig
 from transformers.models.auto.image_processing_auto import get_image_processor_config
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
@@ -68,20 +68,21 @@ def _get_hf_token() -> str | None:
 
 class LazyConfigDict(dict):
     def __getitem__(self, key):
+        if isinstance(value := super().__getitem__(key), type):
+            return value
+
         import vllm.transformers_utils.configs as configs
 
-        return getattr(configs, super().__getitem__(key))
+        return getattr(configs, value)
 
 
 _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
     chatglm="ChatGLMConfig",
     deepseek_vl_v2="DeepseekVLV2Config",
-    deepseek_v3="DeepseekV3Config",
-    deepseek_v32="DeepseekV3Config",
+    deepseek_v32=DeepseekV3Config,
     flex_olmo="FlexOlmoConfig",
     kimi_linear="KimiLinearConfig",
     kimi_vl="KimiVLConfig",
-    Llama_Nemotron_Nano_VL="Nemotron_Nano_VL_Config",
     RefinedWeb="RWConfig",  # For tiiuae/falcon-40b(-instruct)
     RefinedWebModel="RWConfig",  # For tiiuae/falcon-7b(-instruct)
     jais="JAISConfig",
@@ -106,6 +107,7 @@ _CONFIG_ATTRS_MAPPING: dict[str, str] = {
 
 _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = {
     "internvl_chat": {"has_no_defaults_at_init": True},
+    "Llama_Nemotron_Nano_VL": {"attn_implementation": "eager"},
     "NVLM_D": {"has_no_defaults_at_init": True},
 }
 
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 663a8e44d71dd..ac612b255143c 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -8,7 +8,6 @@ Model configs may be defined in this directory for the following reasons:
 """
 
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
-from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
 from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
@@ -28,7 +27,6 @@ from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
-from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
 from vllm.transformers_utils.configs.olmo3 import Olmo3Config
 from vllm.transformers_utils.configs.ovis import OvisConfig
 from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
@@ -44,7 +42,6 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 __all__ = [
     "ChatGLMConfig",
     "DeepseekVLV2Config",
-    "DeepseekV3Config",
     "DotsOCRConfig",
     "EAGLEConfig",
     "FlexOlmoConfig",
@@ -59,7 +56,6 @@ __all__ = [
     "KimiVLConfig",
     "NemotronConfig",
     "NemotronHConfig",
-    "Nemotron_Nano_VL_Config",
     "Olmo3Config",
     "OvisConfig",
     "RadioConfig",
diff --git a/vllm/transformers_utils/configs/deepseek_v3.py b/vllm/transformers_utils/configs/deepseek_v3.py
deleted file mode 100644
index 91fbed79dd021..0000000000000
--- a/vllm/transformers_utils/configs/deepseek_v3.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
-
-
-class DeepseekV3Config(PretrainedConfig):
-    model_type = "deepseek_v3"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=129280,
-        hidden_size=7168,
-        intermediate_size=18432,
-        moe_intermediate_size=2048,
-        num_hidden_layers=61,
-        num_nextn_predict_layers=1,
-        num_attention_heads=128,
-        num_key_value_heads=128,
-        n_shared_experts=1,
-        n_routed_experts=256,
-        ep_size=1,
-        routed_scaling_factor=2.5,
-        kv_lora_rank=512,
-        q_lora_rank=1536,
-        qk_rope_head_dim=64,
-        v_head_dim=128,
-        qk_nope_head_dim=128,
-        topk_method="noaux_tc",
-        n_group=8,
-        topk_group=4,
-        num_experts_per_tok=8,
-        moe_layer_freq=1,
-        first_k_dense_replace=3,
-        norm_topk_prob=True,
-        scoring_func="sigmoid",
-        hidden_act="silu",
-        max_position_embeddings=4096,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=None,
-        bos_token_id=0,
-        eos_token_id=1,
-        tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
-        attention_bias=False,
-        attention_dropout=0.0,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.moe_intermediate_size = moe_intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_nextn_predict_layers = num_nextn_predict_layers
-        self.num_attention_heads = num_attention_heads
-        self.n_shared_experts = n_shared_experts
-        self.n_routed_experts = n_routed_experts
-        self.ep_size = ep_size
-        self.routed_scaling_factor = routed_scaling_factor
-        self.kv_lora_rank = kv_lora_rank
-        self.q_lora_rank = q_lora_rank
-        self.qk_rope_head_dim = qk_rope_head_dim
-        self.v_head_dim = v_head_dim
-        self.qk_nope_head_dim = qk_nope_head_dim
-        self.topk_method = topk_method
-        self.n_group = n_group
-        self.topk_group = topk_group
-        self.num_experts_per_tok = num_experts_per_tok
-        self.moe_layer_freq = moe_layer_freq
-        self.first_k_dense_replace = first_k_dense_replace
-        self.norm_topk_prob = norm_topk_prob
-        self.scoring_func = scoring_func
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py
index 7abfe62298422..8b02a4ddd4bc7 100644
--- a/vllm/transformers_utils/configs/deepseek_vl2.py
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -3,7 +3,7 @@
 
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
 
-from transformers.configuration_utils import PretrainedConfig
+from transformers import DeepseekV2Config, PretrainedConfig
 
 
 class VisionEncoderConfig(PretrainedConfig):
@@ -87,106 +87,6 @@ class MlpProjectorConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class DeepseekV2Config(PretrainedConfig):
-    model_type = "deepseek_v2"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=102400,
-        hidden_size=4096,
-        intermediate_size=11008,
-        moe_intermediate_size=1407,
-        num_hidden_layers=30,
-        num_attention_heads=32,
-        num_key_value_heads=32,
-        n_shared_experts=None,
-        n_routed_experts=None,
-        ep_size=1,
-        routed_scaling_factor=1.0,
-        kv_lora_rank=512,
-        q_lora_rank=1536,
-        qk_rope_head_dim=64,
-        v_head_dim=128,
-        qk_nope_head_dim=128,
-        topk_method="gready",
-        n_group=None,
-        topk_group=None,
-        num_experts_per_tok=None,
-        moe_layer_freq=1,
-        first_k_dense_replace=0,
-        norm_topk_prob=False,
-        scoring_func="softmax",
-        aux_loss_alpha=0.001,
-        seq_aux=True,
-        hidden_act="silu",
-        max_position_embeddings=2048,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=None,
-        bos_token_id=100000,
-        eos_token_id=100001,
-        pretraining_tp=1,
-        tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
-        attention_bias=False,
-        attention_dropout=0.0,
-        use_mla=True,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.moe_intermediate_size = moe_intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.n_shared_experts = n_shared_experts
-        self.n_routed_experts = n_routed_experts
-        self.ep_size = ep_size
-        self.routed_scaling_factor = routed_scaling_factor
-        self.kv_lora_rank = kv_lora_rank
-        self.q_lora_rank = q_lora_rank
-        self.qk_rope_head_dim = qk_rope_head_dim
-        self.v_head_dim = v_head_dim
-        self.qk_nope_head_dim = qk_nope_head_dim
-        self.topk_method = topk_method
-        self.n_group = n_group
-        self.topk_group = topk_group
-        self.num_experts_per_tok = num_experts_per_tok
-        self.moe_layer_freq = moe_layer_freq
-        self.first_k_dense_replace = first_k_dense_replace
-        self.norm_topk_prob = norm_topk_prob
-        self.scoring_func = scoring_func
-        self.aux_loss_alpha = aux_loss_alpha
-        self.seq_aux = seq_aux
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = float(rms_norm_eps)
-        self.pretraining_tp = pretraining_tp
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-        self.use_mla = use_mla
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-
 class DeepseekVLV2Config(PretrainedConfig):
     model_type = "deepseek_vl_v2"
     vision_config: VisionEncoderConfig
@@ -218,3 +118,9 @@ class DeepseekVLV2Config(PretrainedConfig):
         self.global_view_pos = global_view_pos
         self.candidate_resolutions = candidate_resolutions
         self.vocab_size = self.text_config.vocab_size
+
+        # update model_type for OCR model
+        if "DeepseekOCRForCausalLM" in (
+            self.architectures or kwargs.get("architectures", [])
+        ):
+            self.model_type = "deepseek_ocr"
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index 4da877f9e81f5..f5dc9ddfbc575 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -3,9 +3,7 @@
 
 import os
 
-from transformers import AutoConfig, PretrainedConfig
-
-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
+from transformers import AutoConfig, DeepseekV2Config, PretrainedConfig
 
 
 class EAGLEConfig(PretrainedConfig):
@@ -20,13 +18,7 @@ class EAGLEConfig(PretrainedConfig):
     ):
         model_config: PretrainedConfig | DeepseekV2Config | None
         if isinstance(model, dict):
-            archs = model.get("architectures", [])
-            target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
-            if any(target_arch in archs for target_arch in target_archs):
-                # AutoConfig does not support DeepSeek MoE models yet
-                model_config = DeepseekV2Config(**model)
-            else:
-                model_config = AutoConfig.for_model(**model)
+            model_config = AutoConfig.for_model(**model)
         else:
             model_config = model
 
diff --git a/vllm/transformers_utils/configs/kimi_vl.py b/vllm/transformers_utils/configs/kimi_vl.py
index e8c19d0ec2ffe..6d992464cbe81 100644
--- a/vllm/transformers_utils/configs/kimi_vl.py
+++ b/vllm/transformers_utils/configs/kimi_vl.py
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
 
+from transformers import DeepseekV2Config
 from transformers.configuration_utils import PretrainedConfig
 
-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 
 
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index d5bf79e01f954..c6f04febe37e1 100644
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -24,6 +24,18 @@ def adapt_config_dict(config_dict: dict[str, Any], **kwargs) -> PretrainedConfig
     if bool(config_dict.get("yarn")):
         config_dict = _remap_mistral_yarn_args(config_dict)
 
+    if bool(config_dict.get("llama_4_scaling")):
+        llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"]
+        assert all(
+            [
+                key in config_dict["llama_4_scaling"]
+                for key in llama_4_scaling_config_keys
+            ]
+        ), (
+            "llama_4_scaling config should define the keys: "
+            f"{','.join(llama_4_scaling_config_keys)}"
+        )
+
     is_vision = (config_dict.get("multimodal") or {}).get(
         "vision_encoder_args"
     ) or config_dict.get("vision_encoder")
@@ -66,19 +78,24 @@ def _remap_mistral_vision_args(config: dict) -> dict:
 
 
 def _remap_mistral_yarn_args(config: dict) -> dict:
-    # Direct remaps: yarn.X -> rope_scaling.Y
-    # Source keys are from mistral.model.args.YarnArgs
-    _map = {
+    yarn_config_map = {
+        "factor": "factor",
+        "original_max_position_embeddings": "original_max_position_embeddings",
         "beta": "beta_fast",
         "alpha": "beta_slow",
+        "apply_scale": "apply_yarn_scaling",
     }
     yarn_config = config.get("yarn") or {}
-    renamed_yarn_config = {_map.get(k, k): v for k, v in yarn_config.items()}
     config["rope_scaling"] = {
         "rope_type": "yarn",
-        "mscale_all_dim": 1,  # We hardcoded this to 1
-        **renamed_yarn_config,
+        "mscale_all_dim": 1,
     }
+    for old_name, new_name in yarn_config_map.items():
+        if old_name in yarn_config:
+            config["rope_scaling"][new_name] = yarn_config.pop(old_name)
+
+    assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"
+
     return config
 
 
diff --git a/vllm/transformers_utils/configs/nemotron_vl.py b/vllm/transformers_utils/configs/nemotron_vl.py
deleted file mode 100644
index 6f98fbafbed5f..0000000000000
--- a/vllm/transformers_utils/configs/nemotron_vl.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# ruff: noqa: E501
-# Adapted from
-# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
-# --------------------------------------------------------
-# Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License
-#     LICENSE is in incl_licenses directory.
-# --------------------------------------------------------
-
-from transformers import LlamaConfig
-from transformers.configuration_utils import PretrainedConfig
-from transformers.dynamic_module_utils import get_class_from_dynamic_module
-
-
-class Nemotron_Nano_VL_Config(PretrainedConfig):
-    model_type = "Llama_Nemotron_Nano_VL"
-    is_composition = True
-
-    def __init__(
-        self,
-        vision_config=None,
-        llm_config=None,
-        force_image_size=None,
-        downsample_ratio=0.5,
-        template=None,
-        ps_version="v1",
-        image_tag_type="internvl",
-        projector_hidden_size=4096,
-        vit_hidden_size=1280,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        if vision_config is not None:
-            assert (
-                "auto_map" in vision_config
-                and "AutoConfig" in vision_config["auto_map"]
-            )
-            vision_auto_config = get_class_from_dynamic_module(
-                *vision_config["auto_map"]["AutoConfig"].split("--")[::-1]
-            )
-            self.vision_config = vision_auto_config(**vision_config)
-        else:
-            self.vision_config = PretrainedConfig()
-
-        if llm_config is None:
-            self.text_config = LlamaConfig()
-        else:
-            self.text_config = LlamaConfig(**llm_config)
-
-        # Assign configuration values
-        self.force_image_size = force_image_size
-        self.downsample_ratio = downsample_ratio
-        self.template = template  # TODO move out of here and into the tokenizer
-        self.ps_version = ps_version  # Pixel shuffle version
-        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
-        self.projector_hidden_size = projector_hidden_size
-        self.vit_hidden_size = vit_hidden_size
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index 8ba3aec454ad7..b3469c1b18f2d 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -1,8 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib
+import inspect
 from functools import lru_cache
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, Any, cast, get_args, get_type_hints
 
 from transformers import (
     AutoFeatureExtractor,
@@ -55,6 +57,23 @@ def _get_processor_factory_fn(processor_cls: type | tuple[type, ...]):
     return processor_cls
 
 
+@lru_cache
+def _collect_dynamic_keys_from_processing_kwargs(kwargs_cls: type) -> set[str]:
+    dynamic_kwargs: set[str] = set()
+    if kwargs_cls is None:
+        return dynamic_kwargs
+    # get kwargs annotations in processor
+    # merge text_kwargs / images_kwargs / videos_kwargs / audio_kwargs
+    kwargs_type_annotations = get_type_hints(kwargs_cls)
+    for kw_type in ("text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"):
+        if kw_type in kwargs_type_annotations:
+            kw_annotations = get_type_hints(kwargs_type_annotations[kw_type])
+            for kw_name in kw_annotations:
+                dynamic_kwargs.add(kw_name)
+    dynamic_kwargs |= {"text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"}
+    return dynamic_kwargs
+
+
 def _merge_mm_kwargs(
     model_config: "ModelConfig",
     processor_cls: type | tuple[type, ...],
@@ -71,7 +90,6 @@ def _merge_mm_kwargs(
         requires_kw_only=False,
         allow_var_kwargs=True,
     )
-
     # NOTE: Pythonic dict is not hashable and will raise unhashable type
     # error when calling `cached_get_processor`, therefore we need to
     # wrap it to a hashable dict.
@@ -145,12 +163,80 @@ def get_processor(
 cached_get_processor = lru_cache(get_processor)
 
 
+@lru_cache
+def get_processor_kwargs_from_processor(processor: _P) -> set[str]:
+    try:
+        # get kwargs annotations in processor
+        call_kwargs = inspect.signature(type(processor).__call__).parameters.get(
+            "kwargs"
+        )
+        call_kwargs_annotations = call_kwargs.annotation if call_kwargs else None
+        # if the processor has explicit kwargs annotation, use it
+        if call_kwargs_annotations not in (None, inspect._empty):
+            # get_type_hints will parse all type annotations at runtime,
+            # and if an annotation refers to a type or
+            # name that hasn’t been imported or defined, it will raise an error.
+            # So we use __annotations__ to get the raw annotations directly.
+            return _collect_dynamic_keys_from_processing_kwargs(
+                get_args(call_kwargs_annotations)[0]
+            )
+        # otherwise, try to get from ProcessingKwargs
+        else:
+            module_name = type(processor).__module__
+            mod = importlib.import_module(module_name)
+            # find *ProcessingKwargs in the module
+            processor_kwargs: set[str] = set()
+            for name, obj in vars(mod).items():
+                if name.endswith("ProcessingKwargs"):
+                    processor_kwargs = (
+                        processor_kwargs
+                        | _collect_dynamic_keys_from_processing_kwargs(obj)
+                    )
+            return processor_kwargs
+    except Exception:
+        return set()
+
+
+def cached_get_processor_without_dynamic_kwargs(
+    processor_name: str,
+    *args: Any,
+    revision: str | None = None,
+    trust_remote_code: bool = False,
+    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
+    **kwargs: Any,
+) -> _P:
+    # Step 1: use default kwargs to get a temporary processor instance
+    processor = cached_get_processor(
+        processor_name,
+        revision=revision,
+        trust_remote_code=trust_remote_code,
+        processor_cls=processor_cls,  # type: ignore[arg-type]
+    )
+
+    # Step 2: use temporary processor collect dynamic keys
+    dynamic_keys = get_processor_kwargs_from_processor(processor)
+
+    # Step 3: use dynamic_keys filter kwargs
+    filtered_kwargs = {k: v for k, v in kwargs.items() if k not in dynamic_keys}
+
+    # Step 4: use filtered kwargs to get final processor instance
+    final_processor = cached_get_processor(
+        processor_name,
+        revision=revision,
+        trust_remote_code=trust_remote_code,
+        processor_cls=processor_cls,  # type: ignore[arg-type]
+        **filtered_kwargs,
+    )
+
+    return final_processor
+
+
 def cached_processor_from_config(
     model_config: "ModelConfig",
     processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
     **kwargs: Any,
 ) -> _P:
-    return cached_get_processor(
+    return cached_get_processor_without_dynamic_kwargs(
         model_config.model,
         revision=model_config.revision,
         trust_remote_code=model_config.trust_remote_code,
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 6f710bf23360f..39198a1f3d815 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -121,8 +121,8 @@ def _prepare_apply_chat_template_tools_and_messages(
     #
     # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
     for message in messages:
-        # Remove reasoning_content as unsupported by Mistral
-        _ = message.pop("reasoning_content", None)  # type: ignore
+        # Remove reasoning as unsupported by Mistral
+        _ = message.pop("reasoning", None)  # type: ignore
 
     # The Mistral client, in comparison to the OpenAI client, requires the
     # "parameters" dict and the "description" string to be present
@@ -165,6 +165,7 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
 
 class MistralTokenizer(TokenizerBase):
     def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None:
+        from mistral_common.protocol.instruct.validator import ValidationMode
         from mistral_common.tokens.tokenizers.sentencepiece import (
             SentencePieceTokenizer,
         )
@@ -175,6 +176,14 @@ class MistralTokenizer(TokenizerBase):
         self.instruct = self.mistral.instruct_tokenizer
         self.tokenizer = self.instruct.tokenizer
 
+        mode = self.mistral._chat_completion_request_validator._mode
+        if mode != ValidationMode.test:
+            raise ValueError(
+                "Mistral tokenizer must be in test mode. Make sure to "
+                "set `mode='ValidationMode.test'` when creating the "
+                "Mistral tokenizer."
+            )
+
         _mistral_version_str = str(self.tokenizer.version.value)
         self.version: int = int(_mistral_version_str.split("v")[-1])
 
@@ -191,6 +200,12 @@ class MistralTokenizer(TokenizerBase):
         # Sort the dict for convenience
         self._vocab_dict = dict(sorted(self._vocab_dict.items(), key=lambda x: x[1]))
 
+        # Cache special tokens for faster access.
+        self._special_token_ids = self._get_special_token_ids()
+        self._special_token_ids_set = set(self._special_token_ids)
+        self._special_tokens = self._get_special_tokens(self._special_token_ids)
+        self._special_tokens_set = set(self._special_tokens)
+
         # Vocab sorted by token id.
         self._vocab = self.tokenizer._vocab
         self._max_token_id = self.vocab_size - 1
@@ -199,6 +214,7 @@ class MistralTokenizer(TokenizerBase):
     def from_pretrained(
         cls, path_or_repo_id: str, *, revision: str | None = None
     ) -> "MistralTokenizer":
+        from mistral_common.protocol.instruct.validator import ValidationMode
         from transformers.tokenization_mistral_common import (
             MistralCommonTokenizer as TransformersMistralTokenizer,
         )
@@ -206,27 +222,11 @@ class MistralTokenizer(TokenizerBase):
         str_revision = "main" if revision is None else revision
         return cls(
             TransformersMistralTokenizer.from_pretrained(
-                path_or_repo_id, revision=str_revision
+                path_or_repo_id, revision=str_revision, mode=ValidationMode.test
             )
         )
 
-    # the following attributes are set to fit vLLM's design and are used
-    # by the structured output backends.
-    @property
-    def all_special_tokens_extended(self) -> list[str]:
-        return self.all_special_tokens
-
-    @property
-    def all_special_tokens(self) -> list[str]:
-        from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
-
-        return [
-            self.tokenizer.decode([i], special_token_policy=SpecialTokenPolicy.KEEP)
-            for i in self.all_special_ids
-        ]
-
-    @property
-    def all_special_ids(self) -> list[int]:
+    def _get_special_token_ids(self) -> list[int]:
         from mistral_common.tokens.tokenizers.sentencepiece import (
             SentencePieceTokenizer,
         )
@@ -244,6 +244,28 @@ class MistralTokenizer(TokenizerBase):
             raise ValueError(f"Unknown tokenizer type: {type(self.tokenizer)}")
         return sorted(special_ids)
 
+    def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]:
+        from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
+
+        return [
+            self.tokenizer.decode([i], special_token_policy=SpecialTokenPolicy.KEEP)
+            for i in all_special_ids
+        ]
+
+    # the following attributes are set to fit vLLM's design and are used
+    # by the structured output backends.
+    @property
+    def all_special_tokens_extended(self) -> list[str]:
+        return self.all_special_tokens
+
+    @property
+    def all_special_tokens(self) -> list[str]:
+        return self._special_tokens
+
+    @property
+    def all_special_ids(self) -> list[int]:
+        return self._special_token_ids
+
     @property
     def bos_token_id(self) -> int:
         return self.tokenizer.bos_id
@@ -277,21 +299,7 @@ class MistralTokenizer(TokenizerBase):
         raise NotImplementedError()
 
     def _is_special_token_id(self, token_id: int) -> bool:
-        from mistral_common.tokens.tokenizers.sentencepiece import (
-            SentencePieceTokenizer,
-        )
-        from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-
-        if self.is_spm:
-            assert isinstance(self.tokenizer, SentencePieceTokenizer), type(
-                self.tokenizer
-            )
-            return token_id in self.tokenizer._control_tokens
-        if self.is_tekken:
-            assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer)
-            return token_id < self.tokenizer.num_special_tokens
-        else:
-            raise ValueError(f"Unknown tokenizer type: {type(self.tokenizer)}")
+        return token_id in self._special_token_ids_set
 
     def __len__(self) -> int:
         return self.vocab_size
@@ -341,20 +349,14 @@ class MistralTokenizer(TokenizerBase):
         max_length: int | None = None,
         add_special_tokens: bool | None = None,
     ) -> list[int]:
-        if add_special_tokens is not None:
-            return self.transformers_tokenizer.encode(
-                text,
-                truncation=truncation,
-                max_length=max_length,
-                add_special_tokens=add_special_tokens,
-            )
-        else:
-            encoded = self.tokenizer.encode(text, bos=True, eos=False)
+        encoded = self.tokenizer.encode(
+            text, bos=add_special_tokens is not False, eos=False
+        )
 
-            if truncation is not False and max_length is not None:
-                return encoded[:max_length]
-            else:
-                return encoded
+        if truncation is not False and max_length is not None:
+            return encoded[:max_length]
+        else:
+            return encoded
 
     def apply_chat_template(
         self,
@@ -385,6 +387,9 @@ class MistralTokenizer(TokenizerBase):
         )
 
     def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
+        if isinstance(ids, int):
+            ids = [ids]
+
         return self.transformers_tokenizer.decode(
             ids, skip_special_tokens=skip_special_tokens
         )
@@ -405,7 +410,7 @@ class MistralTokenizer(TokenizerBase):
             tokens = [
                 t
                 for t in tokens
-                if (t in to_decode_special_tokens or t not in self.all_special_tokens)
+                if (t in to_decode_special_tokens or t not in self._special_tokens_set)
             ]
 
             if any(isinstance(t, bytes) for t in tokens):
@@ -489,7 +494,7 @@ class MistralTokenizer(TokenizerBase):
             # We filtered unwanted special tokens so we can decode the rest.
             tokens = [
                 self.tokenizer.id_to_byte_piece(token_id, SpecialTokenPolicy.KEEP)
-                if token_id not in self.all_special_ids
+                if token_id not in self._special_token_ids_set
                 else self.tokenizer.decode([token_id], SpecialTokenPolicy.KEEP)
                 for token_id in ids_kept
             ]
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index af2df195f2958..1ae42ba622dc4 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -19,6 +19,14 @@ def is_s3(model_or_path: str) -> bool:
     return model_or_path.lower().startswith("s3://")
 
 
+def is_gcs(model_or_path: str) -> bool:
+    return model_or_path.lower().startswith("gs://")
+
+
+def is_cloud_storage(model_or_path: str) -> bool:
+    return is_s3(model_or_path) or is_gcs(model_or_path)
+
+
 def check_gguf_file(model: str | PathLike) -> bool:
     """Check if the file is a GGUF model."""
     model = Path(model)
diff --git a/vllm/utils/cache.py b/vllm/utils/cache.py
index d5e08caa8a1ed..4338983f90601 100644
--- a/vllm/utils/cache.py
+++ b/vllm/utils/cache.py
@@ -3,7 +3,7 @@
 from collections import UserDict
 from collections.abc import Callable, Hashable, Iterator, KeysView, Mapping
 from types import MappingProxyType
-from typing import Generic, NamedTuple, TypeVar, cast, overload
+from typing import NamedTuple, TypeVar, cast, overload
 
 import cachetools
 
@@ -48,7 +48,7 @@ class CacheInfo(NamedTuple):
         )
 
 
-class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
+class LRUCache(cachetools.LRUCache[_K, _V]):
     def __init__(self, capacity: float, getsizeof: Callable[[_V], float] | None = None):
         super().__init__(capacity, getsizeof)
 
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index d7e4ea2e03884..5101020fda12f 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -19,6 +19,9 @@ import torch
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
@@ -40,9 +43,13 @@ def has_flashinfer() -> bool:
     if importlib.util.find_spec("flashinfer") is None:
         logger.debug_once("FlashInfer unavailable since package was not found")
         return False
+    # When not using flashinfer cubin,
     # Also check if nvcc is available since it's required to JIT compile flashinfer
-    if shutil.which("nvcc") is None:
-        logger.debug_once("FlashInfer unavailable since nvcc was not found")
+    if not envs.VLLM_HAS_FLASHINFER_CUBIN and shutil.which("nvcc") is None:
+        logger.debug_once(
+            "FlashInfer unavailable since nvcc was not found "
+            "and not using pre-downloaded cubins"
+        )
         return False
     return True
 
@@ -222,6 +229,9 @@ def force_use_trtllm_attention() -> bool | None:
     return `True` if TRTLLM attention is forced to be used,
     return `False` if TRTLLM attention is forced to be not used.
     """
+    if vllm_is_batch_invariant():
+        logger.info_once("VLLM_USE_TRTLLM_ATTENTION is disabled for batch-invariant")
+        return False
     return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION)
 
 
diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py
index 4dd85ef26f34a..160ac9ac263a9 100644
--- a/vllm/utils/gc_utils.py
+++ b/vllm/utils/gc_utils.py
@@ -89,6 +89,21 @@ class GCDebugger:
             )
 
 
+def freeze_gc_heap() -> None:
+    """
+    Freeze all objects tracked by the garbage collector. It should be invoked
+    after server init / warmup, to reduce GC overhead from static objects
+    during serving time.
+    """
+    # Ensure all static objects are pushed down to the oldest generation for
+    # freeze
+    gc.collect(0)
+    gc.collect(1)
+    gc.collect(2)
+    # Freeze all GC tracked objects
+    gc.freeze()
+
+
 def maybe_attach_gc_debug_callback() -> None:
     """
     Attached a callback for GC debug when VLLM_GC_DEBUG is enabled.
diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py
index 409a5a6cd302d..f01d2c7a6a33d 100644
--- a/vllm/utils/import_utils.py
+++ b/vllm/utils/import_utils.py
@@ -403,3 +403,9 @@ def has_triton_kernels() -> bool:
 def has_tilelang() -> bool:
     """Whether the optional `tilelang` package is available."""
     return _has_module("tilelang")
+
+
+def has_arctic_inference() -> bool:
+    """Whether the optional `arctic_inference` package is available."""
+
+    return _has_module("arctic_inference")
diff --git a/vllm/utils/registry.py b/vllm/utils/registry.py
new file mode 100644
index 0000000000000..ac9b859159ead
--- /dev/null
+++ b/vllm/utils/registry.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+
+class ExtensionManager:
+    """
+    A registry for managing pluggable extension classes.
+
+    This class provides a simple mechanism to register and instantiate
+    extension classes by name. It is commonly used to implement plugin
+    systems where different implementations can be swapped at runtime.
+
+    Examples:
+        Basic usage with a registry instance:
+
+        >>> FOO_REGISTRY = ExtensionManager()
+        >>> @FOO_REGISTRY.register("my_foo_impl")
+        ... class MyFooImpl(Foo):
+        ...     def __init__(self, value):
+        ...         self.value = value
+        >>> foo_impl = FOO_REGISTRY.load("my_foo_impl", value=123)
+
+    """
+
+    def __init__(self) -> None:
+        """
+        Initialize an empty extension registry.
+        """
+        self.name2class: dict[str, type] = {}
+
+    def register(self, name: str):
+        """
+        Decorator to register a class with the given name.
+        """
+
+        def wrap(cls_to_register):
+            self.name2class[name] = cls_to_register
+            return cls_to_register
+
+        return wrap
+
+    def load(self, cls_name: str, *args, **kwargs) -> Any:
+        """
+        Instantiate and return a registered extension class by name.
+        """
+        cls = self.name2class.get(cls_name)
+        assert cls is not None, f"Extension class {cls_name} not found"
+        return cls(*args, **kwargs)
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index adcacb34cb7c0..fd5c1b73f1910 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
 import importlib.metadata
+import os
 import threading
 from collections.abc import Callable, Collection
 from functools import lru_cache
@@ -68,6 +69,33 @@ def set_default_torch_num_threads(num_threads: int):
     torch.set_num_threads(old_num_threads)
 
 
+@contextlib.contextmanager
+def guard_cuda_initialization():
+    """Avoid unexpected CUDA initialization."""
+    from vllm.platforms import current_platform
+
+    if not current_platform.is_cuda():
+        yield
+        return
+
+    had_key = "CUDA_VISIBLE_DEVICES" in os.environ
+    old_value = os.environ.get("CUDA_VISIBLE_DEVICES")
+    os.environ["CUDA_VISIBLE_DEVICES"] = ""
+    try:
+        yield
+    except Exception as e:
+        if "No CUDA GPUs are available" in str(e):
+            err_msg = "CUDA initialization is blocked."
+        else:
+            err_msg = str(e)
+        raise RuntimeError(err_msg) from e
+    finally:
+        if had_key:
+            os.environ["CUDA_VISIBLE_DEVICES"] = old_value
+        else:
+            os.environ.pop("CUDA_VISIBLE_DEVICES")
+
+
 def get_dtype_size(dtype: torch.dtype) -> int:
     """Get the size of the data type in bytes."""
     return torch.tensor([], dtype=dtype).element_size()
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
index 0d3e1729ff208..20d987fa2de3b 100644
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -66,10 +66,6 @@ class TorchSDPABackend(AttentionBackend):
     def get_impl_cls() -> type["TorchSDPABackendImpl"]:
         return TorchSDPABackendImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return TorchSDPAMetadata
-
     @staticmethod
     def get_builder_cls() -> type["TorchSDPAMetadataBuilderV1"]:
         return TorchSDPAMetadataBuilderV1
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 1eac94940e781..15bb2f4a40acb 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -11,7 +11,6 @@ from vllm import envs
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionImpl,
-    AttentionMetadata,
     AttentionType,
     MultipleOf,
     is_quantized_kv_cache,
@@ -27,6 +26,7 @@ from vllm.attention.utils.fa_utils import (
 
 if is_flash_attn_varlen_func_available():
     from vllm.attention.utils.fa_utils import (
+        flash_attn_supports_sinks,
         flash_attn_varlen_func,
         get_scheduler_metadata,
         reshape_and_cache_flash,
@@ -42,6 +42,7 @@ from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
+    get_dcp_local_seq_lens,
     get_kv_cache_layout,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
@@ -62,7 +63,11 @@ class FlashAttentionBackend(AttentionBackend):
 
     @staticmethod
     def get_supported_kernel_block_size() -> list[int | MultipleOf]:
-        return [MultipleOf(16)]
+        # NOTE(tdoublep): while in principle, FA supports
+        # MultipleOf(16), these are the block sizes that do not
+        # suffer from the NaN propagation problem described here:
+        # https://github.com/Dao-AILab/flash-attention/issues/1974
+        return [16, 32, 64]
 
     @classmethod
     def validate_head_size(cls, head_size: int) -> None:
@@ -84,10 +89,6 @@ class FlashAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["FlashAttentionImpl"]:
         return FlashAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return FlashAttentionMetadata
-
     @staticmethod
     def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]:
         return FlashAttentionMetadataBuilder
@@ -233,19 +234,16 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
             self.dcp_world_size = 1
             self.dcp_rank = 0
 
+        self.dcp_kv_cache_interleave_size = (
+            self.parallel_config.dcp_kv_cache_interleave_size
+        )
+
         self.use_full_cuda_graph = (
             self.compilation_config.cudagraph_mode.has_full_cudagraphs()
         )
         self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
 
         if self.use_full_cuda_graph and self.aot_schedule:
-            if self.max_cudagraph_size > 992:
-                # This condition derives from FA3's internal heuristic.
-                # TODO(woosuk): Support larger cudagraph sizes.
-                raise ValueError(
-                    "Capture size larger than 992 is not supported for full cuda graph."
-                )
-
             self.scheduler_metadata = torch.zeros(
                 vllm_config.scheduler_config.max_num_seqs + 1,
                 dtype=torch.int32,
@@ -354,8 +352,12 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
                 - common_attn_metadata.query_start_loc_cpu[:-1]
             )
             dcp_context_kv_lens_cpu = seq_lens_cpu - query_kv_lens_cpu
-            dcp_context_kv_lens_cpu = dcp_context_kv_lens_cpu // self.dcp_world_size + (
-                self.dcp_rank <= (dcp_context_kv_lens_cpu - 1) % self.dcp_world_size
+
+            dcp_context_kv_lens_cpu = get_dcp_local_seq_lens(
+                dcp_context_kv_lens_cpu,
+                self.dcp_world_size,
+                self.dcp_rank,
+                self.dcp_kv_cache_interleave_size,
             )
             dcp_context_kv_lens = dcp_context_kv_lens_cpu.to(self.device)
             max_dcp_context_kv_len = dcp_context_kv_lens.max().item()
@@ -493,7 +495,7 @@ class FlashAttentionImpl(AttentionImpl):
 
         self.sinks = sinks
         if self.sinks is not None:
-            assert self.vllm_flash_attn_version == 3, (
+            assert flash_attn_supports_sinks(), (
                 "Sinks are only supported in FlashAttention 3"
             )
             assert self.sinks.shape[0] == num_heads, (
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index e71d4ca4629dc..683725b95819f 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -195,10 +195,6 @@ class FlashInferBackend(AttentionBackend):
     def get_impl_cls() -> type["FlashInferImpl"]:
         return FlashInferImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["FlashInferMetadata"]:
-        return FlashInferMetadata
-
     @staticmethod
     def get_builder_cls() -> type["FlashInferMetadataBuilder"]:
         return FlashInferMetadataBuilder
@@ -402,6 +398,15 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         )
         self.paged_kv_last_page_len_np = self.paged_kv_last_page_len_cpu.numpy()
 
+        if self.head_dim == 256 and current_platform.is_device_capability(100):
+            # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
+            # head size 256 and block size 16 is not supported on blackwell.
+            assert kv_cache_spec.block_size != 16, (
+                "There is a bug in FlashInfer "
+                "block_size 16 head size 256 support. Please avoid this combination by "
+                "passing --block-size 32 or --block-size 64."
+            )
+
     def _get_workspace_buffer(self):
         if self._workspace_buffer is None:
             buffer_size = FLASHINFER_WORKSPACE_BUFFER_SIZE
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index 928252636d583..9af63831cecba 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -20,7 +20,6 @@ from torch.nn.attention.flex_attention import (
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionImpl,
-    AttentionMetadata,
     AttentionType,
     is_quantized_kv_cache,
 )
@@ -89,10 +88,6 @@ class FlexAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["FlexAttentionImpl"]:
         return FlexAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return FlexAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py
index 30c63e0ded8e7..909af09be255a 100644
--- a/vllm/v1/attention/backends/mamba1_attn.py
+++ b/vllm/v1/attention/backends/mamba1_attn.py
@@ -7,11 +7,13 @@ import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.backends.utils import PAD_SLOT_ID
+from vllm.config import VllmConfig
 from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
     CommonAttentionMetadata,
     split_decodes_and_prefills,
 )
+from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
 
 
 class Mamba1AttentionBackend(AttentionBackend):
@@ -22,32 +24,41 @@ class Mamba1AttentionBackend(AttentionBackend):
 
 @dataclass
 class Mamba1AttentionMetadata:
-    query_start_loc: torch.Tensor
-    context_lens_tensor: torch.Tensor
+    query_start_loc_p: torch.Tensor
     state_indices_tensor: torch.Tensor
-    has_initial_states: torch.Tensor | None
+    has_initial_states_p: torch.Tensor | None
     num_prefills: int
     num_prefill_tokens: int
     num_decodes: int
     num_decode_tokens: int
     num_padded_decodes: int
 
+    block_idx_last_scheduled_token: torch.Tensor  # shape: [batch,]
+    block_idx_first_scheduled_token_p: torch.Tensor  # shape: [batch,]
+    block_idx_last_computed_token: torch.Tensor  # shape: [batch,]
+    num_computed_tokens_p: torch.Tensor  # shape: [batch,]
+
 
 class Mamba1AttentionMetadataBuilder(
     BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]
 ):
+    def __init__(
+        self,
+        kv_cache_spec: AttentionSpec,
+        layer_names: list[str],
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
+        assert isinstance(kv_cache_spec, MambaSpec)
+
     def build(
         self,
         common_prefix_len: int,
         common_attn_metadata: CommonAttentionMetadata,
         fast_build: bool = False,
     ) -> Mamba1AttentionMetadata:
-        query_start_loc = common_attn_metadata.query_start_loc
-
-        state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
-        context_lens_tensor = common_attn_metadata.num_computed_tokens_cpu.to(
-            query_start_loc.device
-        )
+        num_reqs = common_attn_metadata.num_reqs
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
             split_decodes_and_prefills(
@@ -55,32 +66,100 @@ class Mamba1AttentionMetadataBuilder(
             )
         )
 
-        has_initial_states = None
+        has_initial_states_p = None
+        query_start_loc_p = None
         padded_decodes = num_decodes
+        num_computed_tokens, num_computed_tokens_p = None, None
+        block_idx_first_scheduled_token = None
+        block_idx_first_scheduled_token_p = None
+
+        # TODO(@Josephasafg) Mamba1 and Mamba2 have a lot of code in common here.
+        # We should consolidate this code
+        if self.vllm_config.cache_config.enable_prefix_caching:
+            # Return a tensor of shape (#requests, #max blocks)
+            state_indices_tensor = common_attn_metadata.block_table_tensor
+            mamba_block_size = self.kv_cache_spec.block_size
+            num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
+                self.device
+            )
+            (
+                block_idx_last_computed_token,
+                block_idx_first_scheduled_token,
+                block_idx_last_scheduled_token,
+            ) = self._compute_prefix_caching_block_indices(
+                common_attn_metadata, mamba_block_size
+            )
+        else:
+            # Always return just a single block per each request:
+            state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
+            block_idx_last_scheduled_token = None
+            block_idx_last_computed_token = None
 
         if num_prefills > 0:
-            has_initial_states = context_lens_tensor > 0
+            query_start_loc_p = (
+                common_attn_metadata.query_start_loc[-num_prefills - 1 :]
+                - num_decode_tokens
+            )
+            has_initial_states_cpu = (
+                common_attn_metadata.num_computed_tokens_cpu[
+                    num_reqs - num_prefills : num_reqs
+                ]
+                > 0
+            )
+            has_initial_states_p = has_initial_states_cpu.to(
+                common_attn_metadata.query_start_loc.device
+            )
+
+            if self.vllm_config.cache_config.enable_prefix_caching:
+                assert num_computed_tokens is not None
+                num_computed_tokens_p = num_computed_tokens[
+                    num_reqs - num_prefills : num_reqs
+                ]
+                assert block_idx_first_scheduled_token is not None
+                block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[
+                    num_reqs - num_prefills : num_reqs
+                ]
+
         elif (
             num_decodes > 0
             and num_decodes <= self.decode_cudagraph_max_bs
             and self.compilation_config.full_cuda_graph
         ):
-            state_indices_for_decode = state_indices_tensor[:num_decodes]
             padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes)
             self.state_indices_tensor[:num_decodes].copy_(
-                state_indices_for_decode, non_blocking=True
+                state_indices_tensor, non_blocking=True
             )
             state_indices_tensor = self.state_indices_tensor[:padded_decodes]
             state_indices_tensor[num_decodes:] = PAD_SLOT_ID
 
+            if self.vllm_config.cache_config.enable_prefix_caching:
+                self.block_idx_last_scheduled_token[:num_decodes].copy_(
+                    block_idx_last_scheduled_token, non_blocking=True
+                )
+                block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
+                    :padded_decodes
+                ]
+                block_idx_last_scheduled_token[num_decodes:] = 0
+
+                self.block_idx_last_computed_token[:num_decodes].copy_(
+                    block_idx_last_computed_token, non_blocking=True
+                )
+                block_idx_last_computed_token = self.block_idx_last_computed_token[
+                    :padded_decodes
+                ]
+                block_idx_last_computed_token[num_decodes:] = 0
+
         return Mamba1AttentionMetadata(
-            query_start_loc=query_start_loc,
-            context_lens_tensor=context_lens_tensor,
-            has_initial_states=has_initial_states,
+            query_start_loc_p=query_start_loc_p,
+            has_initial_states_p=has_initial_states_p,
             state_indices_tensor=state_indices_tensor,
             num_prefills=num_prefills,
             num_prefill_tokens=num_prefill_tokens,
             num_decodes=num_decodes,
             num_decode_tokens=num_decode_tokens,
             num_padded_decodes=padded_decodes,
+            block_idx_last_scheduled_token=block_idx_last_scheduled_token,
+            block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
+            block_idx_last_computed_token=block_idx_last_computed_token,
+            num_computed_tokens_p=num_computed_tokens_p,
         )
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index f9d2426eaf632..4bc1057333a50 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -147,27 +147,6 @@ class Mamba2AttentionMetadataBuilder(
         assert self.chunk_size is not None, (
             "chunk_size needs to be set in the model config for Mamba2 models"
         )
-        if self.vllm_config.cache_config.enable_prefix_caching:
-            self.state_indices_tensor = torch.empty(
-                (
-                    self.decode_cudagraph_max_bs,
-                    cdiv(
-                        vllm_config.model_config.max_model_len, kv_cache_spec.block_size
-                    ),
-                ),
-                dtype=torch.int32,
-                device=device,
-            )
-            self.block_idx_last_scheduled_token = torch.empty(
-                (self.decode_cudagraph_max_bs,),
-                dtype=torch.int32,
-                device=device,
-            )
-            self.block_idx_last_computed_token = torch.empty(
-                (self.decode_cudagraph_max_bs,),
-                dtype=torch.int32,
-                device=device,
-            )
 
     def build(
         self,
@@ -202,20 +181,13 @@ class Mamba2AttentionMetadataBuilder(
             num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
                 self.device
             )
-            # Block index of the last computed token
-            block_idx_last_computed_token = (
-                cdiv(num_computed_tokens, mamba_block_size) - 1
+            (
+                block_idx_last_computed_token,
+                block_idx_first_scheduled_token,
+                block_idx_last_scheduled_token,
+            ) = self._compute_prefix_caching_block_indices(
+                common_attn_metadata, mamba_block_size
             )
-            # which is <= block index for the first scheduled token
-            block_idx_first_scheduled_token = (
-                cdiv(num_computed_tokens + 1, mamba_block_size) - 1
-            )
-            # which is <= block index of the last scheduled token
-            block_idx_last_scheduled_token = (
-                cdiv(common_attn_metadata.seq_lens, mamba_block_size) - 1
-            )
-            # -1 in case it's non-computed and causes later issues with indexing
-            block_idx_last_computed_token = block_idx_last_computed_token.clamp(min=0)
         else:
             # Always return just a single block per each request:
             state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py
index 52f26a9e61cab..49d7d6c31b9a0 100644
--- a/vllm/v1/attention/backends/mamba_attn.py
+++ b/vllm/v1/attention/backends/mamba_attn.py
@@ -7,6 +7,7 @@ from typing import ClassVar, TypeVar
 import torch
 
 from vllm.config import VllmConfig
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
@@ -38,11 +39,35 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
             self.vllm_config.scheduler_config.max_num_seqs,
             self.compilation_config.max_cudagraph_capture_size,
         )
-        self.state_indices_tensor = torch.empty(
-            (self.decode_cudagraph_max_bs,),
-            dtype=torch.int32,
-            device=device,
-        )
+
+        if self.vllm_config.cache_config.enable_prefix_caching:
+            self.state_indices_tensor = torch.empty(
+                (
+                    self.decode_cudagraph_max_bs,
+                    cdiv(
+                        self.vllm_config.model_config.max_model_len,
+                        self.kv_cache_spec.block_size,
+                    ),
+                ),
+                dtype=torch.int32,
+                device=device,
+            )
+            self.block_idx_last_scheduled_token = torch.empty(
+                (self.decode_cudagraph_max_bs,),
+                dtype=torch.int32,
+                device=device,
+            )
+            self.block_idx_last_computed_token = torch.empty(
+                (self.decode_cudagraph_max_bs,),
+                dtype=torch.int32,
+                device=device,
+            )
+        else:
+            self.state_indices_tensor = torch.empty(
+                (self.decode_cudagraph_max_bs,),
+                dtype=torch.int32,
+                device=device,
+            )
 
     def build_for_cudagraph_capture(
         self, common_attn_metadata: CommonAttentionMetadata
@@ -61,3 +86,30 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
         m.max_query_len = 1  # decode-only
 
         return self.build(0, m)
+
+    def _compute_prefix_caching_block_indices(
+        self,
+        common_attn_metadata: CommonAttentionMetadata,
+        mamba_block_size: int,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
+            self.device
+        )
+        # Block index of the last computed token
+        block_idx_last_computed_token = cdiv(num_computed_tokens, mamba_block_size) - 1
+        # which is <= block index for the first scheduled token
+        block_idx_first_scheduled_token = (
+            cdiv(num_computed_tokens + 1, mamba_block_size) - 1
+        )
+        # which is <= block index of the last scheduled token
+        block_idx_last_scheduled_token = (
+            cdiv(common_attn_metadata.seq_lens, mamba_block_size) - 1
+        )
+        # -1 in case it's non-computed and causes later issues with indexing
+        block_idx_last_computed_token = block_idx_last_computed_token.clamp(min=0)
+
+        return (
+            block_idx_last_computed_token,
+            block_idx_first_scheduled_token,
+            block_idx_last_scheduled_token,
+        )
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 0ec1573004197..e38f7bcfa44e1 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -198,10 +198,10 @@ from tqdm import tqdm
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionLayer,
-    AttentionMetadata,
     MLAAttentionImpl,
 )
 from vllm.attention.backends.utils import get_mla_dims
@@ -225,6 +225,7 @@ from vllm.utils.math_utils import cdiv, round_down
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
+    get_dcp_local_seq_lens,
     get_per_layer_parameters,
     infer_global_hyperparameters,
     split_decodes_and_prefills,
@@ -270,28 +271,15 @@ except ImportError:
     flashinfer_available = False
 
 
-def is_rocm_aiter_fp8bmm_enabled() -> bool:
-    return (
-        current_platform.is_rocm()
-        and envs.VLLM_ROCM_USE_AITER_FP8BMM
-        and envs.VLLM_ROCM_USE_AITER
-    )
-
-
-if is_rocm_aiter_fp8bmm_enabled():
-    from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import (  # noqa: E501
-        batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm,  # noqa: E501
-    )
-
-    def dynamic_per_batched_tensor_quant(
-        x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn
-    ):
-        DTYPE_MAX = torch.finfo(dtype).max
-        min_val, max_val = x.aminmax()
-        amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-10)
-        scale = DTYPE_MAX / amax
-        x_scl_sat = (x * scale).clamp(min=-DTYPE_MAX, max=DTYPE_MAX)
-        return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
+def dynamic_per_batched_tensor_quant(
+    x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn
+):
+    DTYPE_MAX = torch.finfo(dtype).max
+    min_val, max_val = x.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-10)
+    scale = DTYPE_MAX / amax
+    x_scl_sat = (x * scale).clamp(min=-DTYPE_MAX, max=DTYPE_MAX)
+    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
 
 
 logger = init_logger(__name__)
@@ -306,10 +294,6 @@ class MLACommonBackend(AttentionBackend):
     def get_name() -> str:
         return "TRITON_MLA"
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return MLACommonMetadata
-
     @staticmethod
     def get_builder_cls() -> type["MLACommonMetadataBuilder"]:
         return MLACommonMetadataBuilder
@@ -361,10 +345,9 @@ class MLACommonPrefillMetadata:
         workspace: torch.Tensor
 
         # for mla DCP
-        cp_chunk_seq_lens: list[list[int]] | None = None
-        origin_context_lens: list[int] | None = None
-        cp_cu_seq_lens: torch.Tensor | None = None
-        chunk_size: int | None = None
+        padded_local_chunk_seq_lens: list[list[int]] | None = None
+        local_context_lens_allranks: list[list[int]] | None = None
+        padded_local_cu_seq_lens: torch.Tensor | None = None
         cu_seq_lens_lst: list[list[int]] | None = None
 
     block_table: torch.Tensor
@@ -545,6 +528,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         vllm_config: VllmConfig,
         device: torch.device,
         metadata_cls: type[M] | None = None,
+        supports_dcp_with_varlen: bool = False,
     ):
         self.metadata_cls = (
             metadata_cls if metadata_cls is not None else MLACommonMetadata
@@ -567,6 +551,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             # DCP might not be initialized in testing
             self.dcp_world_size = 1
             self.dcp_rank = 0
+        self.dcp_local_block_size = parallel_config.dcp_kv_cache_interleave_size
+        self.dcp_virtual_block_size = self.dcp_local_block_size * self.dcp_world_size
 
         # Don't try to access the runner on AMD
         if self.aot_schedule:
@@ -638,7 +624,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
 
         supports_spec_decode = self.query_len_support != QueryLenSupport.SINGLE_ONLY
         self._init_reorder_batch_threshold(
-            self.reorder_batch_threshold, supports_spec_decode
+            self.reorder_batch_threshold, supports_spec_decode, supports_dcp_with_varlen
         )
 
         # Validate consistency between query_len_support and reorder_batch_threshold
@@ -793,15 +779,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             )
         )
 
-        # Note(hc): update seq_lens of decode reqs under DCP.
-        if self.dcp_world_size > 1:
-            assert dcp_local_seq_lens is not None
-            dcp_local_seq_lens[:num_decodes] = seq_lens[
-                :num_decodes
-            ] // self.dcp_world_size + (
-                self.dcp_rank <= (seq_lens[:num_decodes] - 1) % self.dcp_world_size
-            )
-
         assert num_decodes + num_prefills == num_reqs
         assert num_decode_tokens + num_prefill_tokens == num_tokens
 
@@ -810,11 +787,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             reqs_start = num_decodes  # prefill_start
 
             context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs]
-            # Note(hc): The context lengths in the perspective of dcp rank0.
-            cp_context_lens_cpu = torch.ceil(
-                context_lens_cpu.float() / self.dcp_world_size
-            ).int()
-            origin_context_lens = context_lens_cpu.tolist()
             max_context_len_cpu = context_lens_cpu.max().item()
             num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item()
             prefill_query_start_loc = (
@@ -870,32 +842,56 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                 )
 
                 if self.dcp_world_size > 1:
+                    local_context_lens_allranks = get_dcp_local_seq_lens(
+                        context_lens_cpu,
+                        self.dcp_world_size,
+                        None,
+                        self.dcp_local_block_size,
+                    )
+                    # Note(qcs): The max local context lengths
+                    # padded to `dcp_local_block_size`.
+                    padded_local_context_lens_cpu = (
+                        cdiv(
+                            context_lens_cpu,
+                            self.dcp_virtual_block_size,
+                        )
+                        * self.dcp_local_block_size
+                    )
                     # Note(hc): The above max_context_chunk already enforces
                     # block_size alignment, DCP just need the block_size can
                     # be divisible by dcp_world_size, because DCP use
                     # cp_gather_cache which not require `cp_chunk_starts`
                     # aligned to page_size.
                     assert max_context_chunk % self.dcp_world_size == 0
-                    cp_max_context_chunk = max_context_chunk // self.dcp_world_size
-                    cp_chunk_starts = (
+                    padded_local_max_context_chunk_across_ranks = (
+                        cdiv(
+                            max_context_chunk,
+                            self.dcp_virtual_block_size,
+                        )
+                        * self.dcp_local_block_size
+                    )
+                    local_chunk_starts = (
                         torch.arange(num_chunks, dtype=torch.int32)
                         .unsqueeze(1)
                         .expand(-1, num_prefills)
-                        * cp_max_context_chunk
+                        * padded_local_max_context_chunk_across_ranks
                     )
-                    cp_chunk_ends = torch.min(
-                        cp_context_lens_cpu.unsqueeze(0),
-                        cp_chunk_starts + cp_max_context_chunk,
+                    local_chunk_ends = torch.min(
+                        padded_local_context_lens_cpu.unsqueeze(0),
+                        local_chunk_starts
+                        + padded_local_max_context_chunk_across_ranks,
                     )
-                    cp_chunk_seq_lens = (cp_chunk_ends - cp_chunk_starts).clamp(min=0)
+                    padded_local_chunk_seq_lens = (
+                        local_chunk_ends - local_chunk_starts
+                    ).clamp(min=0)
 
-                    cp_cu_seq_lens_cpu = torch.zeros(
+                    padded_local_cu_chunk_seq_lens_cpu = torch.zeros(
                         num_chunks, num_prefills + 1, dtype=torch.int32, pin_memory=True
                     )
                     torch.cumsum(
-                        cp_chunk_seq_lens,
+                        padded_local_chunk_seq_lens,
                         dim=1,
-                        out=cp_cu_seq_lens_cpu[:, 1:],
+                        out=padded_local_cu_chunk_seq_lens_cpu[:, 1:],
                         dtype=torch.int32,
                     )
 
@@ -907,15 +903,16 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                 if self.dcp_world_size > 1:
                     chunked_context_metadata = chunked_context_metadata_cls(
                         cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
-                        starts=cp_chunk_starts.to(device, non_blocking=True),
-                        seq_tot=cp_chunk_seq_lens.sum(dim=1).tolist(),
+                        starts=local_chunk_starts.to(device, non_blocking=True),
+                        seq_tot=padded_local_chunk_seq_lens.sum(dim=1).tolist(),
                         max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
                         seq_lens=chunk_seq_lens,
                         workspace=self.chunked_prefill_workspace,
-                        cp_chunk_seq_lens=cp_chunk_seq_lens.tolist(),
-                        origin_context_lens=origin_context_lens,
-                        cp_cu_seq_lens=cp_cu_seq_lens_cpu.to(device, non_blocking=True),
-                        chunk_size=max_context_chunk,
+                        padded_local_chunk_seq_lens=padded_local_chunk_seq_lens.tolist(),
+                        local_context_lens_allranks=local_context_lens_allranks.tolist(),
+                        padded_local_cu_seq_lens=padded_local_cu_chunk_seq_lens_cpu.to(
+                            device, non_blocking=True
+                        ),
                         cu_seq_lens_lst=cu_seq_lens_cpu.tolist(),
                     )
                 else:
@@ -997,64 +994,52 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
 def reorg_kvcache(
     allgatered_kv_c_normed: torch.Tensor,
     allgatered_k_pe: torch.Tensor,
-    cp_chunk_seq_lens_lst: list[int],
-    origin_context_lens: list[int],
-    cp_world_size: int,
+    padded_local_chunk_seq_lens_lst: list[int],
+    local_context_lens_allranks: list[list[int]],
     sum_seq_len: int,
     max_seq_len: int,
-    chunk_size: int,
-    chunk_idx: int,
     toks: int,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
-    reorg kvcache after cp local gather to tp layout for attn kernel.
-
+    reorg and unpad kvcache after cp local gather to tp layout for attn kernel.
+    e.g.
+    allgatered_kv_c_normed = [T0_0, T0_1, T0_2, T0_3, T1_0, T1_1, ...,
+                              T0_4, T0_5, pad, pad, T1_2, pad, ...]
+    -> reorganized_kv_c_normed = [T0_0, T0_1, T0_2, T0_3, T0_4, T0_5,
+                                  T1_0, T1_1, T1_2, ...]
     Args:
-        cp_chunk_seq_lens_lst: chunk context lengths under CP.
-        origin_context_lens: origin full context lengths under CP.
-        cp_world_size: CP size.
+        padded_local_chunk_seq_lens_lst: local chunk context lengths
+            under current CP rank.
+        local_context_lens_allranks: local context lengths on each CP rank.
         sum_seq_len: the sum of cp_chunk_seq_lens_lst.
         max_seq_len: the max value of cp_chunk_seq_lens_lst.
-        chunk_size: equals to max_context_chunk from
-            chunked_context_metadata building.
-        chunk_idx: chunk idx of chunked_prefill.
         toks: the number of tokens for local gather cache.
     """
     kv_c_segments = []
     k_pe_segments = []
     src_token_idx = 0
     max_seq_len_check = 0
-    for cp_chunk_seq_len, origin_context_len in zip(
-        cp_chunk_seq_lens_lst, origin_context_lens
+    for padded_local_chunk_seq_len, local_context_lens in zip(
+        padded_local_chunk_seq_lens_lst, local_context_lens_allranks
     ):
-        chunk_context_len = chunk_size
-        if cp_chunk_seq_len != 0:
-            chunk_context_len = min(
-                chunk_context_len, origin_context_len - chunk_size * chunk_idx
-            )
-        cp_target_rank = (chunk_context_len - 1) % cp_world_size
         cur_seq_len = 0
-        for rank in range(cp_world_size):
-            if rank > cp_target_rank and cp_chunk_seq_len:
-                real_cp_chunk_seq_len = cp_chunk_seq_len - 1
-            else:
-                real_cp_chunk_seq_len = cp_chunk_seq_len
-            if real_cp_chunk_seq_len:
+        for rank, local_context_len in enumerate(local_context_lens):
+            if local_context_len != 0:
                 kv_c_segment = allgatered_kv_c_normed[
                     rank * toks + src_token_idx : rank * toks
                     + src_token_idx
-                    + real_cp_chunk_seq_len
+                    + local_context_len
                 ]
                 k_pe_segment = allgatered_k_pe[
                     rank * toks + src_token_idx : rank * toks
                     + src_token_idx
-                    + real_cp_chunk_seq_len
+                    + local_context_len
                 ]
                 kv_c_segments.append(kv_c_segment)
                 k_pe_segments.append(k_pe_segment)
-                cur_seq_len += real_cp_chunk_seq_len
+                cur_seq_len += local_context_len
         max_seq_len_check = max(max_seq_len_check, cur_seq_len)
-        src_token_idx += cp_chunk_seq_len
+        src_token_idx += padded_local_chunk_seq_len
     reorganized_kv_c_normed = torch.cat(kv_c_segments, dim=0)
     reorganized_k_pe = torch.cat(k_pe_segments, dim=0)
     assert reorganized_kv_c_normed.shape[0] == sum_seq_len
@@ -1112,6 +1097,7 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
         self.kv_b_proj = kv_b_proj
         self.indexer = indexer
         self.q_pad_num_heads = q_pad_num_heads
+        self.is_aiter_triton_fp8_bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled()
 
     def process_weights_after_loading(self, act_dtype: torch.dtype):
         def get_layer_weight(layer):
@@ -1161,7 +1147,7 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
             [self.qk_nope_head_dim, self.v_head_dim], dim=-1
         )
 
-        if is_rocm_aiter_fp8bmm_enabled():
+        if self.is_aiter_triton_fp8_bmm_enabled:
             W_K = W_UK.transpose(0, 1)  # 16 512 128
             W_V = W_UV.permute(1, 2, 0)  # 16 128 512
             self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant(
@@ -1190,7 +1176,7 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
                     dtype=torch.bfloat16,
                     device=self.W_K.device,
                 )
-                aiter_triton_fp8_bmm(
+                rocm_aiter_ops.triton_fp8_bmm(
                     x, self.W_K, self.W_K_scale, group_size=128, transpose_bm=True
                 )
 
@@ -1199,7 +1185,7 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
                     dtype=torch.bfloat16,
                     device=self.W_V.device,
                 )
-                aiter_triton_fp8_bmm(
+                rocm_aiter_ops.triton_fp8_bmm(
                     x, self.W_V, self.W_V_scale, group_size=128, transpose_bm=True
                 )
         else:
@@ -1211,10 +1197,9 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
     def _v_up_proj(self, x: torch.Tensor, out: torch.Tensor):
         # Convert from (B, N, L) to (N, B, L)
         x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
-
-        if is_rocm_aiter_fp8bmm_enabled():
+        if self.is_aiter_triton_fp8_bmm_enabled:
             # Multiply + Transpose (N, B, L) x (N, L, V)->(N, B, V)->(B, N, V)
-            x = aiter_triton_fp8_bmm(
+            x = rocm_aiter_ops.triton_fp8_bmm(
                 x, self.W_V, self.W_V_scale, group_size=128, transpose_bm=True
             )
             # Convert from (B, N, V) to (B, N * V)
@@ -1295,6 +1280,9 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
                 get_current_vllm_config()
             )
         )
+        self.dcp_kv_cache_interleave_size: int = (
+            get_current_vllm_config().parallel_config.dcp_kv_cache_interleave_size
+        )
 
     def _flash_attn_varlen_diff_headdims(
         self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
@@ -1571,7 +1559,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             [self.qk_nope_head_dim, self.v_head_dim], dim=-1
         )
 
-        if is_rocm_aiter_fp8bmm_enabled():
+        if self.is_aiter_triton_fp8_bmm_enabled:
             W_K = W_UK.transpose(0, 1)  # 16 512 128
             W_V = W_UV.permute(1, 2, 0)  # 16 128 512
             self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant(
@@ -1600,7 +1588,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
                     dtype=torch.bfloat16,
                     device=self.W_K.device,
                 )
-                aiter_triton_fp8_bmm(
+                rocm_aiter_ops.triton_fp8_bmm(
                     x, self.W_K, self.W_K_scale, group_size=128, transpose_bm=True
                 )
 
@@ -1609,7 +1597,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
                     dtype=torch.bfloat16,
                     device=self.W_V.device,
                 )
-                aiter_triton_fp8_bmm(
+                rocm_aiter_ops.triton_fp8_bmm(
                     x, self.W_V, self.W_V_scale, group_size=128, transpose_bm=True
                 )
         else:
@@ -1696,10 +1684,9 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         assert attn_metadata.prefill is not None
         prefill_metadata = attn_metadata.prefill
         assert prefill_metadata.chunked_context is not None
-        assert prefill_metadata.chunked_context.cp_chunk_seq_lens is not None
-        assert prefill_metadata.chunked_context.origin_context_lens is not None
-        assert prefill_metadata.chunked_context.cp_cu_seq_lens is not None
-        assert prefill_metadata.chunked_context.chunk_size is not None
+        assert prefill_metadata.chunked_context.padded_local_chunk_seq_lens is not None
+        assert prefill_metadata.chunked_context.local_context_lens_allranks is not None
+        assert prefill_metadata.chunked_context.padded_local_cu_seq_lens is not None
         assert prefill_metadata.chunked_context.cu_seq_lens_lst is not None
 
         output = None
@@ -1712,7 +1699,9 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
                 src_cache=kv_c_and_k_pe_cache,
                 dst=workspace,
                 block_table=prefill_metadata.block_table,
-                cu_seq_lens=prefill_metadata.chunked_context.cp_cu_seq_lens[i],
+                cu_seq_lens=prefill_metadata.chunked_context.padded_local_cu_seq_lens[
+                    i
+                ],
                 batch_size=attn_metadata.num_prefills,
                 seq_starts=prefill_metadata.chunked_context.starts[i],
             )
@@ -1742,15 +1731,12 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             kv_c_normed, k_pe = reorg_kvcache(
                 allgatered_kv_c_normed,
                 allgatered_k_pe,
-                cp_chunk_seq_lens_lst=prefill_metadata.chunked_context.cp_chunk_seq_lens[
+                padded_local_chunk_seq_lens_lst=prefill_metadata.chunked_context.padded_local_chunk_seq_lens[
                     i
                 ],
-                origin_context_lens=prefill_metadata.chunked_context.origin_context_lens,
-                cp_world_size=dcp_world_size,
+                local_context_lens_allranks=prefill_metadata.chunked_context.local_context_lens_allranks,
                 sum_seq_len=prefill_metadata.chunked_context.cu_seq_lens_lst[i][-1],
                 max_seq_len=prefill_metadata.chunked_context.max_seq_lens[i],
-                chunk_size=prefill_metadata.chunked_context.chunk_size,
-                chunk_idx=i,
                 toks=toks,
             )
 
@@ -1960,7 +1946,6 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
             # Convert from (B, N, P) to (N, B, P)
             decode_q_nope = decode_q_nope.transpose(0, 1)
 
-            # Pads the head_dim if necessary (for the underlying kernel)
             if self.q_pad_num_heads is not None:
                 B, N, L = decode_q_pe.shape
                 decode_pe_padded = decode_q_pe.new_empty((B, self.q_pad_num_heads, L))
@@ -1968,9 +1953,9 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
                 decode_pe_padded.copy_(decode_q_pe)
                 decode_q_pe = decode_pe_padded
 
-            if is_rocm_aiter_fp8bmm_enabled():
+            if self.is_aiter_triton_fp8_bmm_enabled:
                 # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L)
-                decode_ql_nope = aiter_triton_fp8_bmm(
+                decode_ql_nope = rocm_aiter_ops.triton_fp8_bmm(
                     decode_q_nope,
                     self.W_K,
                     self.W_K_scale,
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index a6aac701b784b..79b89c7890a28 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -41,10 +41,6 @@ class FlashAttnMLABackend(MLACommonBackend):
     def get_name() -> str:
         return "FLASH_ATTN_MLA"
 
-    @staticmethod
-    def get_metadata_cls() -> type["FlashAttnMLAMetadata"]:
-        return FlashAttnMLAMetadata
-
     @staticmethod
     def get_builder_cls() -> type["FlashAttnMLAMetadataBuilder"]:
         return FlashAttnMLAMetadataBuilder
@@ -81,7 +77,12 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
         device: torch.device,
     ):
         super().__init__(
-            kv_cache_spec, layer_names, vllm_config, device, FlashAttnMLAMetadata
+            kv_cache_spec,
+            layer_names,
+            vllm_config,
+            device,
+            FlashAttnMLAMetadata,
+            supports_dcp_with_varlen=True,
         )
         self.max_num_splits = 0  # No upper bound on the number of splits.
         self.fa_aot_schedule = get_flash_attn_version() == 3
@@ -92,13 +93,6 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
         self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
 
         if self.use_full_cuda_graph and self.fa_aot_schedule:
-            if self.max_cudagraph_size > 992:
-                # This condition derives from FA3's internal heuristic.
-                # TODO(woosuk): Support larger cudagraph sizes.
-                raise ValueError(
-                    "Capture size larger than 992 is not supported for full cuda graph."
-                )
-
             self.scheduler_metadata = torch.zeros(
                 vllm_config.scheduler_config.max_num_seqs + 1,
                 dtype=torch.int32,
@@ -163,6 +157,9 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
             # we only set num_splits when using cuda graphs.
             max_num_splits = self.max_num_splits
 
+        if vllm_is_batch_invariant():
+            max_num_splits = 1
+
         scheduler_metadata = self._schedule_decode(
             num_reqs=seq_lens_cpu.numel(),
             cu_query_lens=query_start_loc_device,
@@ -188,9 +185,6 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
             self.scheduler_metadata[n:] = 0
             scheduler_metadata = self.scheduler_metadata[:n]
 
-        if vllm_is_batch_invariant():
-            max_num_splits = 1
-
         metadata = FlashAttnMLADecodeMetadata(
             block_table=block_table_tensor,
             seq_lens=seq_lens_device,
diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index 1f98204031ed5..708bb9d63839d 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -40,10 +40,6 @@ class FlashMLABackend(MLACommonBackend):
     def get_name() -> str:
         return "FLASHMLA"
 
-    @staticmethod
-    def get_metadata_cls() -> type["FlashMLAMetadata"]:
-        return FlashMLAMetadata
-
     @staticmethod
     def get_builder_cls() -> type["FlashMLAMetadataBuilder"]:
         return FlashMLAMetadataBuilder
@@ -71,7 +67,7 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]):
 class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
     cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH
     query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM
-    reorder_batch_threshold: int = 512  # process small prefills with decode pathway
+    reorder_batch_threshold: int = 128  # process small prefills with decode pathway
     # ^ TODO(matt): tune this
 
     def __init__(
diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py
index bf8e4d5a62896..bf76549de1ce8 100644
--- a/vllm/v1/attention/backends/mla/flashmla_sparse.py
+++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py
@@ -10,7 +10,6 @@ from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionLayer,
-    AttentionMetadata,
 )
 from vllm.attention.backends.utils import get_mla_dims
 from vllm.attention.ops.flashmla import (
@@ -57,10 +56,6 @@ class FlashMLASparseBackend(AttentionBackend):
     def get_name() -> str:
         return "FLASHMLA_SPARSE"
 
-    @staticmethod
-    def get_metadata_cls() -> type[AttentionMetadata]:
-        return FlashMLASparseMetadata
-
     @staticmethod
     def get_builder_cls() -> type["FlashMLASparseMetadataBuilder"]:
         return FlashMLASparseMetadataBuilder
diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py
index 49009a939d0b5..f3c5bb7328712 100644
--- a/vllm/v1/attention/backends/mla/indexer.py
+++ b/vllm/v1/attention/backends/mla/indexer.py
@@ -7,7 +7,6 @@ import torch
 
 from vllm.attention.backends.abstract import (
     AttentionBackend,
-    AttentionMetadata,
     MultipleOf,
 )
 from vllm.config import VllmConfig
@@ -24,10 +23,6 @@ logger = init_logger(__name__)
 
 
 class DeepseekV32IndexerBackend(AttentionBackend):
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return DeepseekV32IndexerMetadata
-
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
         return [32, 64, 128]
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 962cad927e6d5..5757aeadba056 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -6,9 +6,8 @@ from typing import ClassVar
 
 import torch
 
-import vllm.envs as envs
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.attention.backends.abstract import AttentionLayer
-from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
 from vllm.config import VllmConfig
 from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.mla.common import (
@@ -22,10 +21,6 @@ from vllm.v1.attention.backends.utils import AttentionCGSupport
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 
-def is_aiter_mla_enabled() -> bool:
-    return envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MLA
-
-
 class AiterMLABackend(MLACommonBackend):
     @staticmethod
     def get_name() -> str:
@@ -35,10 +30,6 @@ class AiterMLABackend(MLACommonBackend):
     def get_impl_cls() -> type["AiterMLAImpl"]:
         return AiterMLAImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AiterMLAMetadata"]:
-        return AiterMLAMetadata
-
     @staticmethod
     def get_builder_cls() -> type["AiterMLAMetadataBuilder"]:
         return AiterMLAMetadataBuilder
@@ -78,9 +69,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
         super().__init__(
             kv_cache_spec, layer_names, vllm_config, device, AiterMLAMetadata
         )
-        assert self.kv_cache_spec.block_size == 1, (
-            "AITER MLAonly supports block size 1."
-        )
 
         self.compilation_config = vllm_config.compilation_config
         max_num_pages_per_req = cdiv(
@@ -94,6 +82,11 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
         # so we can only use the persistent buffer if a cudagraph is actually
         # being used.
         if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
+            self.block_table_remapping = torch.zeros(
+                [max_num_reqs, max_num_pages_per_req * self.kv_cache_spec.block_size],
+                dtype=torch.int32,
+                device=device,
+            )
             self.paged_kv_indptr = torch.zeros(
                 max_num_reqs + 1, dtype=torch.int32, device=device
             )
@@ -119,13 +112,29 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
         dcp_tot_seq_lens_device: torch.Tensor | None,
     ) -> AiterMLADecodeMetadata:
         page_size = self.kv_cache_spec.block_size
-        block_table_bounds = (seq_lens_device + page_size - 1) // page_size
         device = self.device
         num_reqs = seq_lens_device.size(0)
+        bs, _ = block_table_tensor.shape
+        block_table_tensor = (
+            block_table_tensor.unsqueeze(-1).expand(-1, -1, page_size) * page_size
+        )
+        block_table_tensor = (
+            block_table_tensor
+            + torch.arange(
+                0,
+                page_size,
+                device=block_table_tensor.device,
+                dtype=block_table_tensor.dtype,
+            )[None, None, :]
+        )
+        block_table_tensor = block_table_tensor.view(bs, -1)
 
+        # after remapping, we assume the block size already equals to 1
+
+        max_blk_size_per_req = block_table_tensor.shape[-1]
         mask = torch.arange(
             block_table_tensor.size(1), dtype=block_table_tensor.dtype, device=device
-        ).unsqueeze(0) < block_table_bounds.unsqueeze(1)
+        ).unsqueeze(0) < seq_lens_device.unsqueeze(1)
         paged_kv_indices = block_table_tensor[mask]
 
         paged_kv_last_page_len = seq_lens_device % page_size
@@ -135,13 +144,19 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
 
         paged_kv_indptr = torch.cat(
             [
-                torch.zeros(1, dtype=block_table_bounds.dtype, device=device),
-                block_table_bounds.cumsum(dim=0, dtype=torch.int32),
+                torch.zeros(1, dtype=seq_lens_device.dtype, device=device),
+                seq_lens_device.cumsum(dim=0, dtype=torch.int32),
             ]
         )
 
         if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
             num_actual_pages = paged_kv_indices.size(0)
+            self.block_table_remapping[:num_reqs, :max_blk_size_per_req].copy_(
+                block_table_tensor, non_blocking=True
+            )
+            block_table_tensor = self.block_table_remapping[
+                :num_reqs, :max_blk_size_per_req
+            ]
 
             self.paged_kv_indices[:num_actual_pages].copy_(
                 paged_kv_indices, non_blocking=True
@@ -264,7 +279,7 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
         # max_seqlen_qo must be 1 except for MTP
         # TODO: Find the best value for MTP
         max_seqlen_qo = 1
-        aiter_mla_decode_fwd(
+        rocm_aiter_ops.mla_decode_fwd(
             q,
             kv_buffer,
             o,
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index 40a5517877967..525026bac5a7e 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -108,10 +108,6 @@ class PallasAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["PallasAttentionBackendImpl"]:
         return PallasAttentionBackendImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["PallasMetadata"]:
-        return PallasMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index f7a4114a0a708..e8d3758a6395a 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -9,227 +9,207 @@ import torch
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionImpl,
-    AttentionMetadata,
     AttentionType,
     MultipleOf,
 )
+from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
+    split_decodes_prefills_and_extends,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 _PARTITION_SIZE_ROCM = 256
+_CP_TOKENS_PER_ITER_ROCM = 32 * 1024
 
 if current_platform.is_rocm():
     import aiter
+    from aiter.ops.triton.utils.device_info import get_num_sms
 
     from vllm.triton_utils import tl, triton
-    from vllm.utils.torch_utils import direct_register_custom_op
+
+    def block_size(x, head_dim):
+        return min(65536 // x.element_size(), triton.next_power_of_2(head_dim))
+
+    def num_programs(head_dim):
+        return min(head_dim, get_num_sms())
 
     @triton.jit
-    def _vllm_layout_trans_kernel(
-        k_buffer_ptr,
-        v_buffer_ptr,
-        k_values_ptr,
-        v_values_ptr,
-        b_query_lens_loc,
-        b_seq_lens_loc,
-        block_table,
-        block_table_stride_0,
-        k_scale,
-        v_scale,
-        output_dtype: tl.constexpr,
-        E_DIM: tl.constexpr,
+    def cp_mha_gather_cache_kernel(
+        key_cache_ptr,  # [num_blocks, page_size, num_head, head_size]
+        value_cache_ptr,  # [num_blocks, page_size, num_head, head_size]
+        key_ptr,  # [num_tokens, num_heads, head_size]
+        value_ptr,  # [num_tokens, num_heads, head_size]
+        block_table_ptr,  # [num_batches, max_block_num]
+        cu_seqlens_kv_ptr,  # [num_batches + 1]
+        token_to_batch_ptr,  # [max_cum_tokens]
+        seq_start_ptr,  # [num_batches]
+        k_scale_ptr,
+        v_scale_ptr,
+        num_heads,
+        head_size,
+        x,
+        max_block_num,
+        num_tokens,
+        DEQUANT: tl.constexpr,
+        PAGE_SIZE: tl.constexpr,
+        CACHE_FORMAT: tl.constexpr,
         BLOCK_SIZE: tl.constexpr,
+        NUM_PRGMS: tl.constexpr,
     ):
-        batch_idx = tl.program_id(0)
-        block_idx = tl.program_id(1)
+        bid = tl.program_id(0)
+        col_offsets = tl.arange(0, BLOCK_SIZE)
+        if DEQUANT:
+            k_scale = tl.load(k_scale_ptr)
+            v_scale = tl.load(v_scale_ptr)
 
-        batch_query_indexes = tl.load(b_query_lens_loc + batch_idx + tl.arange(0, 2))
-        batch_query_start, batch_query_end = tl.split(batch_query_indexes)
-        query_len = batch_query_end - batch_query_start
-
-        if query_len <= 1:
-            return
-
-        batch_token_indexes = tl.load(b_seq_lens_loc + batch_idx + tl.arange(0, 2))
-        batch_token_start, batch_token_end = tl.split(batch_token_indexes)
-        seq_len = batch_token_end - batch_token_start
-
-        if block_idx * BLOCK_SIZE < seq_len:
-            block_mask = (
-                block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)[:, None]
-            ) < seq_len
-
-            kv_idx = tl.load(
-                block_table + batch_idx * block_table_stride_0 + block_idx
-            ).to(tl.int64)
-
-            kv_buffer_off = (
-                kv_idx * BLOCK_SIZE * E_DIM
-                + tl.arange(0, BLOCK_SIZE)[:, None] * E_DIM
-                + tl.arange(0, E_DIM)[None, :]
+        for token_id in tl.range(bid, num_tokens, NUM_PRGMS):
+            key_ptr_offset = key_ptr + token_id * head_size * num_heads
+            value_ptr_offset = value_ptr + token_id * head_size * num_heads
+            batch_idx = tl.load(token_to_batch_ptr + token_id)
+            batch_start = tl.load(seq_start_ptr + batch_idx)
+            token_start = tl.load(cu_seqlens_kv_ptr + batch_idx)
+            batch_offset = token_id - token_start + batch_start
+            block_offset = batch_offset // PAGE_SIZE
+            block_id = tl.load(
+                block_table_ptr + max_block_num * batch_idx + block_offset
             )
-            k_vals = tl.load(k_buffer_ptr + kv_buffer_off, mask=block_mask, other=0.0)
-            if k_vals.dtype.is_fp8():
-                k_vals = (k_vals.to(tl.float32) * tl.load(k_scale)).to(output_dtype)
-            else:
-                k_vals = k_vals.to(output_dtype)
+            slot_id = batch_offset % PAGE_SIZE
 
-            v_vals = tl.load(v_buffer_ptr + kv_buffer_off, mask=block_mask, other=0.0)
-            if v_vals.dtype.is_fp8():
-                v_vals = (v_vals.to(tl.float32) * tl.load(v_scale)).to(output_dtype)
-            else:
-                v_vals = v_vals.to(output_dtype)
-            kv_values_off = (
-                batch_token_start * E_DIM
-                + block_idx * BLOCK_SIZE * E_DIM
-                + tl.arange(0, BLOCK_SIZE)[:, None] * E_DIM
-                + tl.arange(0, E_DIM)[None, :]
-            )
-            tl.store(k_values_ptr + kv_values_off, k_vals, mask=block_mask)
-            tl.store(v_values_ptr + kv_values_off, v_vals, mask=block_mask)
+            if CACHE_FORMAT == "NHD":
+                # for kv cache layout as
+                # K: [num_blocks, page_size, num_head, head_dim]
+                # V: [num_blocks, page_size, num_head, head_dim]
+                key_cache_ptr_offset = (
+                    key_cache_ptr
+                    + block_id * num_heads * head_size * PAGE_SIZE
+                    + slot_id * num_heads * head_size
+                )
+                value_cache_ptr_offset = (
+                    value_cache_ptr
+                    + block_id * num_heads * head_size * PAGE_SIZE
+                    + slot_id * num_heads * head_size
+                )
 
-    def vllm_layout_trans(
-        b_query_lens_loc,
-        b_seq_lens_loc,
-        block_table,
-        k_cache,
-        v_cache,
-        max_seq_len,
-        k_scale,
-        v_scale,
-        output_dtype,
-        total_tokens,
+                for i in tl.range(0, head_size * num_heads, BLOCK_SIZE):
+                    mask = (col_offsets + i) < head_size * num_heads
+                    k_reg = tl.load(key_cache_ptr_offset + col_offsets + i, mask=mask)
+                    v_reg = tl.load(value_cache_ptr_offset + col_offsets + i, mask=mask)
+                    if DEQUANT:
+                        k_dtype = k_reg.dtype
+                        v_dtype = v_reg.dtype
+                        k_reg = (k_reg.to(tl.float32) * k_scale).to(k_dtype)
+                        v_reg = (v_reg.to(tl.float32) * v_scale).to(v_dtype)
+                    tl.store(key_ptr_offset + col_offsets + i, k_reg, mask=mask)
+                    tl.store(value_ptr_offset + col_offsets + i, v_reg, mask=mask)
+
+    def cp_mha_gather_cache(
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        block_tables: torch.Tensor,
+        k_scales: torch.Tensor,
+        v_scales: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        token_to_batch: torch.Tensor,
+        seq_starts: torch.Tensor,
+        dequant: bool,
+        kv_cache_layout: str,
+        total_tokens: int,
     ):
-        H_KV = v_cache.shape[2]
-        D = v_cache.shape[3]
-        BLOCK_SIZE = v_cache.shape[1]
-
-        k_values = torch.empty(
-            (total_tokens, H_KV, D),
-            dtype=output_dtype,
-            device=k_cache.device,
+        assert kv_cache_layout in ["v0", "NHD", "HND"], (
+            "kv_cache_layout only support v0, NHD, HND"
         )
-        v_values = torch.empty(
-            (total_tokens, H_KV, D),
-            dtype=output_dtype,
-            device=v_cache.device,
+        head_dim = key.shape[2]
+        x = 0
+        # assert dequant is True, "Currently, we only support "\
+        # "gather cache with dequant"
+        # For k cache layout: [num_blocks, num_heads, page_size, head_dim]
+        assert kv_cache_layout == "NHD", (
+            "ROCM_AITER_FA_BACKEND Only support NHD kv cache layout for now"
         )
-
-        grid = (block_table.shape[0], (max_seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE)
-
-        if output_dtype == torch.float16:
-            output_dtype = tl.float16
-        elif output_dtype == torch.bfloat16:
-            output_dtype = tl.bfloat16
-        else:
-            raise ValueError(f"Unsupported output dtype: {output_dtype}")
-
-        _vllm_layout_trans_kernel[grid](
-            k_cache,
-            v_cache,
-            k_values,
-            v_values,
-            b_query_lens_loc,
-            b_seq_lens_loc,
-            block_table,
-            block_table.stride(0),
-            k_scale,
-            v_scale,
-            output_dtype=output_dtype,
-            E_DIM=H_KV * D,
-            BLOCK_SIZE=BLOCK_SIZE,
+        assert head_dim == key_cache.shape[3], (
+            "We assume your kv cache layout is [num_blocks, "
+            "page_size, num_heads, head_dim], but got otherwise"
         )
+        page_size = key_cache.shape[1]
+        num_heads = key_cache.shape[2]
 
-        return k_values, v_values
-
-    def flash_attn_varlen_func_impl(
-        q: torch.Tensor,
-        k_cache: torch.Tensor,
-        v_cache: torch.Tensor,
-        out: torch.Tensor,
-        cu_seqlens_q: torch.Tensor,
-        cu_seqlens_k: torch.Tensor,
-        max_seqlen_q: int,
-        max_seqlen_k: int,
-        softmax_scale: float,
-        window_size: list[int] | None,  # -1 means infinite context window
-        alibi_slopes: list[float] | None,
-        block_table: torch.Tensor,
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        total_tokens: int = 0,
-    ) -> torch.Tensor:
-        if total_tokens == 0:
-            total_tokens = int(cu_seqlens_k[-1].item())
-        k, v = vllm_layout_trans(
-            cu_seqlens_q,
-            cu_seqlens_k,
-            block_table,
-            k_cache,
-            v_cache,
-            max_seqlen_k,
-            k_scale,
-            v_scale,
-            q.dtype,
+        NUM_PRGMS = num_programs(total_tokens)
+        BLOCK_SIZE = block_size(key_cache, head_dim)
+        grid = lambda meta: (NUM_PRGMS,)
+        cp_mha_gather_cache_kernel[grid](
+            key_cache,
+            value_cache,
+            key,
+            value,
+            block_tables,
+            cu_seqlens_kv,
+            token_to_batch,
+            seq_starts,
+            k_scales,
+            v_scales,
+            num_heads,
+            head_dim,
+            x,
+            block_tables.size(1),
             total_tokens,
+            DEQUANT=dequant,
+            PAGE_SIZE=page_size,
+            CACHE_FORMAT=kv_cache_layout,
+            BLOCK_SIZE=BLOCK_SIZE,
+            NUM_PRGMS=NUM_PRGMS,
         )
 
-        output = aiter.flash_attn_varlen_func(
-            q=q,
-            k=k,
-            v=v,
-            cu_seqlens_q=cu_seqlens_q,
-            max_seqlen_q=max_seqlen_q,
-            min_seqlen_q=1,
-            cu_seqlens_k=cu_seqlens_k,
-            max_seqlen_k=max_seqlen_k,
-            softmax_scale=softmax_scale,
-            causal=True,
-            alibi_slopes=alibi_slopes,
-            window_size=window_size,
-            out=out,
-        )
-        return output
-
-    def flash_attn_varlen_func_fake(
-        q: torch.Tensor,
-        k_cache: torch.Tensor,
-        v_cache: torch.Tensor,
-        out: torch.Tensor,
-        cu_seqlens_q: torch.Tensor,
-        cu_seqlens_k: torch.Tensor,
-        max_seqlen_q: int,
-        max_seqlen_k: int,
-        softmax_scale: float,
-        window_size: list[int] | None,  # -1 means infinite context window
-        alibi_slopes: list[float] | None,
-        block_table: torch.Tensor,
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        total_tokens: int = 0,
-    ) -> torch.Tensor:
-        return torch.empty(
-            q.shape[0], q.shape[1], v_cache.shape[-2], dtype=q.dtype, device=q.device
-        )
-
-    direct_register_custom_op(
-        "flash_attn_varlen_func",
-        flash_attn_varlen_func_impl,
-        ["out"],
-        flash_attn_varlen_func_fake,
-        dispatch_key=current_platform.dispatch_key,
-    )
 
 logger = init_logger(__name__)
 
 
+@dataclass
+class AiterFlashAttentionDecodeMetadata:
+    max_query_len: int
+    min_query_len: int
+    max_seq_len: int
+    query_start_loc: torch.Tensor
+
+
+@dataclass
+class AiterFlashAttentionPrefillMetadata:
+    max_query_len: int
+    min_query_len: int
+    max_seq_len: int
+    query_start_loc: torch.Tensor
+
+
+@dataclass
+class AiterChunkContextMetadata:
+    workspace: torch.Tensor
+    cu_seq_lens_chunk: torch.Tensor
+    chunk_starts: torch.Tensor
+    token_to_batch: torch.Tensor
+    seq_tot: list[int]
+    max_seq_lens: list[int]
+    seq_lens: torch.Tensor
+    num_chunks: int
+    total_token_per_batch: list[int]
+
+
+@dataclass
+class AiterFlashAttentionChunkPrefillMetadata:
+    max_query_len: int
+    min_query_len: int
+    max_seq_len: int
+    query_start_loc: torch.Tensor
+    chunk_context_metadata: AiterChunkContextMetadata
+
+
 @dataclass
 class AiterFlashAttentionMetadata:
     # NOTE(sang): Definition of context_len, query_len, and seq_len.
@@ -248,7 +228,18 @@ class AiterFlashAttentionMetadata:
     seq_lens: torch.Tensor
     slot_mapping: torch.Tensor
     block_table: torch.Tensor
-    cu_seq_lens: torch.Tensor | None
+
+    # prefill and deocde split
+    num_decodes: int
+    num_decode_tokens: int
+    num_prefills: int
+    num_prefill_tokens: int
+    num_extends: int
+    num_extend_tokens: int
+
+    decode_metadata: AiterFlashAttentionDecodeMetadata | None
+    prefill_metadata: AiterFlashAttentionPrefillMetadata | None
+    extend_metadata: AiterFlashAttentionChunkPrefillMetadata | None
 
     # For cascade attention.
     use_cascade: bool
@@ -260,6 +251,7 @@ class AiterFlashAttentionMetadataBuilder(
     AttentionMetadataBuilder[AiterFlashAttentionMetadata]
 ):
     cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+    reorder_batch_threshold: int = 1
 
     def __init__(
         self,
@@ -285,6 +277,12 @@ class AiterFlashAttentionMetadataBuilder(
         self.aot_sliding_window: tuple[int, int] | None = None
         self.total_tokens: int = 0
 
+        self.extend_workspace = torch.empty(
+            [2, _CP_TOKENS_PER_ITER_ROCM, self.num_heads_kv, self.headdim],
+            dtype=self.model_config.dtype,
+            device=device,
+        )
+
     def build_for_cudagraph_capture(
         self, common_attn_metadata: CommonAttentionMetadata
     ):
@@ -302,42 +300,139 @@ class AiterFlashAttentionMetadataBuilder(
         common_attn_metadata: CommonAttentionMetadata,
         fast_build: bool = False,
     ) -> "AiterFlashAttentionMetadata":
-        num_actual_tokens = common_attn_metadata.num_actual_tokens
-        max_query_len = common_attn_metadata.max_query_len
-        max_seq_len = common_attn_metadata.max_seq_len
-        query_start_loc = common_attn_metadata.query_start_loc
-        seq_lens = common_attn_metadata.seq_lens
-        block_table_tensor = common_attn_metadata.block_table_tensor
-        slot_mapping = common_attn_metadata.slot_mapping
-        if max_query_len > 1:
-            # We pre-compute cumulative seq len needed for prefill attention
-            # here to avoid recomputing it for every layer
-            cu_seq_lens = torch.zeros(
-                seq_lens.shape[0] + 1, dtype=torch.int32, device=seq_lens.device
-            )
-            torch.cumsum(seq_lens, dim=0, dtype=cu_seq_lens.dtype, out=cu_seq_lens[1:])
-            num_actual_kv_tokens = int(cu_seq_lens[-1].item())
-        else:
-            cu_seq_lens = None
-            num_actual_kv_tokens = 0
+        split_ret = split_decodes_prefills_and_extends(
+            common_attn_metadata,
+            decode_threshold=self.reorder_batch_threshold,
+        )
 
-        def schedule(
-            batch_size, cu_query_lens, max_query_len, seqlens, max_seq_len, causal
-        ):
-            return None
+        (
+            num_decodes,
+            num_extends,
+            num_prefills,
+            num_decode_tokens,
+            num_extend_tokens,
+            num_prefill_tokens,
+        ) = split_ret
+
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
+
+        seq_lens = common_attn_metadata.seq_lens_cpu
+
+        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+
+        decode_metadata = None
+        if num_decodes > 0:
+            decode_metadata = AiterFlashAttentionDecodeMetadata(
+                max_query_len=query_lens_cpu[:num_decodes].max().item(),
+                min_query_len=query_lens_cpu[:num_decodes].min().item(),
+                max_seq_len=seq_lens[:num_decodes].max().item(),
+                query_start_loc=common_attn_metadata.query_start_loc[: num_decodes + 1],
+            )
+
+        prefill_metadata = None
+        if num_prefills > 0:
+            query_lens_for_prefill = query_lens_cpu[num_decodes + num_extends :]
+            query_start_loc_device = common_attn_metadata.query_start_loc[
+                num_decodes + num_extends :
+            ]
+            prefill_metadata = AiterFlashAttentionPrefillMetadata(
+                max_query_len=query_lens_for_prefill.max().item(),
+                min_query_len=query_lens_for_prefill.min().item(),
+                max_seq_len=seq_lens[num_decodes + num_extends :].max().item(),
+                query_start_loc=query_start_loc_device - query_start_loc_device[0],
+            )
+
+        extend_metadata = None
+        if num_extends > 0:
+            num_extends_slice = slice(num_decodes, num_decodes + num_extends)
+            query_lens_for_extend = query_lens_cpu[num_extends_slice]
+            seq_lens_for_extend = common_attn_metadata.seq_lens_cpu[num_extends_slice]
+            computed_kv_lens = seq_lens_for_extend - query_lens_for_extend
+
+            # allocate the equal amount of workspace for
+            # each chunk prefill request
+            max_context_chunk = _CP_TOKENS_PER_ITER_ROCM // num_extends
+            num_chunks = cdiv(computed_kv_lens.max().item(), max_context_chunk)
+
+            chunk_starts = (
+                torch.arange(num_chunks, dtype=torch.int32)
+                .unsqueeze(1)
+                .expand(-1, num_extends)
+                * max_context_chunk
+            )
+            chunk_ends = torch.min(
+                computed_kv_lens.unsqueeze(0), chunk_starts + max_context_chunk
+            )
+            chunk_seq_lens = (chunk_ends - chunk_starts).clamp(
+                min=0
+            )  # [num_chunks, num_extends]
+            cu_seq_lens_cpu = torch.zeros(
+                [num_chunks, num_extends + 1], dtype=torch.int32, pin_memory=True
+            )
+            torch.cumsum(
+                chunk_seq_lens, dim=1, out=cu_seq_lens_cpu[:, 1:], dtype=torch.int32
+            )
+            max_cum_tokens = cu_seq_lens_cpu[:, -1].max().item()
+
+            range_idx = torch.arange(max_cum_tokens, dtype=torch.int32)[None, None, :]
+            idx_to_batch_tensor = range_idx == cu_seq_lens_cpu[:, 1:][:, :, None]
+            idx_to_batch_tensor = idx_to_batch_tensor.sum(
+                dim=1
+            )  # [num_chunks, max_cum_tokens]
+            token_to_batch_tensor = torch.cumsum(idx_to_batch_tensor, dim=1)
+
+            chunk_context_metadata = AiterChunkContextMetadata(
+                workspace=self.extend_workspace,
+                cu_seq_lens_chunk=cu_seq_lens_cpu.to(self.device, non_blocking=True),
+                chunk_starts=chunk_starts.to(self.device, non_blocking=True),
+                seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
+                max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
+                seq_lens=chunk_seq_lens,
+                token_to_batch=token_to_batch_tensor.to(self.device, non_blocking=True),
+                num_chunks=num_chunks,
+                total_token_per_batch=cu_seq_lens_cpu[:, -1].tolist(),
+            )
+
+            query_start_loc_device = common_attn_metadata.query_start_loc[
+                num_decodes : num_decodes + num_extends + 1
+            ]
+            seq_lens_device = common_attn_metadata.seq_lens[num_extends_slice]
+            cu_seq_lens = torch.zeros(
+                num_extends + 1, dtype=torch.int32, device=seq_lens_device.device
+            )
+            torch.cumsum(
+                seq_lens_device, dim=0, dtype=cu_seq_lens.dtype, out=cu_seq_lens[1:]
+            )
+            extend_metadata = AiterFlashAttentionChunkPrefillMetadata(
+                max_query_len=query_lens_for_extend.max().item(),
+                min_query_len=query_lens_for_extend.min().item(),
+                max_seq_len=seq_lens[num_extends_slice].max().item(),
+                query_start_loc=query_start_loc_device - query_start_loc_device[0],
+                chunk_context_metadata=chunk_context_metadata,
+            )
+
+        num_actual_kv_tokens = torch.sum(seq_lens).item()
 
         use_cascade = common_prefix_len > 0
 
         attn_metadata = AiterFlashAttentionMetadata(
-            num_actual_tokens=num_actual_tokens,
+            num_actual_tokens=common_attn_metadata.num_actual_tokens,
             num_actual_kv_tokens=num_actual_kv_tokens,
-            max_query_len=max_query_len,
-            query_start_loc=query_start_loc,
-            max_seq_len=max_seq_len,
-            seq_lens=seq_lens,
-            block_table=block_table_tensor,
-            slot_mapping=slot_mapping,
-            cu_seq_lens=cu_seq_lens,
+            max_query_len=common_attn_metadata.max_query_len,
+            query_start_loc=common_attn_metadata.query_start_loc,
+            max_seq_len=common_attn_metadata.max_seq_len,
+            seq_lens=common_attn_metadata.seq_lens,
+            block_table=common_attn_metadata.block_table_tensor,
+            slot_mapping=common_attn_metadata.slot_mapping,
+            num_decodes=num_decodes,
+            num_decode_tokens=num_decode_tokens,
+            num_prefills=num_prefills,
+            num_prefill_tokens=num_prefill_tokens,
+            num_extends=num_extends,
+            num_extend_tokens=num_extend_tokens,
+            decode_metadata=decode_metadata,
+            prefill_metadata=prefill_metadata,
+            extend_metadata=extend_metadata,
             use_cascade=use_cascade,
             common_prefix_len=common_prefix_len,
             total_tokens=self.total_tokens,
@@ -383,10 +478,6 @@ class AiterFlashAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["AiterFlashAttentionImpl"]:
         return AiterFlashAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return AiterFlashAttentionMetadata
-
     @staticmethod
     def get_builder_cls() -> type["AiterFlashAttentionMetadataBuilder"]:
         return AiterFlashAttentionMetadataBuilder
@@ -401,6 +492,7 @@ class AiterFlashAttentionBackend(AttentionBackend):
     ) -> tuple[int, ...]:
         if block_size % 16 != 0:
             raise ValueError("Block size must be a multiple of 16.")
+
         return (2, num_blocks, block_size, num_kv_heads, head_size)
 
 
@@ -449,6 +541,110 @@ class AiterFlashAttentionImpl(AttentionImpl):
                 "FlashAttentionImpl"
             )
 
+    def extend_forward(
+        self,
+        attn_metadata: AiterFlashAttentionMetadata,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        output: torch.Tensor,
+        cu_seqlens_q: torch.Tensor,
+        max_seqlen_q: int,
+        max_seqlen_k: int,
+        min_seqlen_q: int,
+        block_table: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        k_scale: float,
+        v_scale: float,
+    ):
+        out, lse = aiter.flash_attn_varlen_func(
+            q=query,
+            k=key,
+            v=value,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_q,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_k=max_seqlen_q,
+            min_seqlen_q=min_seqlen_q,
+            dropout_p=0.0,
+            softmax_scale=self.scale,
+            causal=True,
+            window_size=self.sliding_window,
+            alibi_slopes=self.alibi_slopes,
+            return_lse=True,
+        )
+        assert attn_metadata.extend_metadata is not None
+        chunk_context_metadata = attn_metadata.extend_metadata.chunk_context_metadata
+        num_chunks = chunk_context_metadata.num_chunks
+        workspace = chunk_context_metadata.workspace
+        cu_seqlens_kv = chunk_context_metadata.cu_seq_lens_chunk
+        max_seqlens = chunk_context_metadata.max_seq_lens
+        chunk_starts = chunk_context_metadata.chunk_starts
+        token_to_batch = chunk_context_metadata.token_to_batch
+        total_token_per_batch = chunk_context_metadata.total_token_per_batch
+        key_fetched, value_fetched = workspace[0], workspace[1]
+        chunked_output = None
+        chunked_lse = None
+        for chunk_idx in range(num_chunks):
+            cp_mha_gather_cache(
+                key_cache=key_cache,
+                value_cache=value_cache,
+                key=key_fetched,
+                value=value_fetched,
+                block_tables=block_table,
+                k_scales=k_scale,
+                v_scales=v_scale,
+                cu_seqlens_kv=cu_seqlens_kv[chunk_idx],
+                token_to_batch=token_to_batch[chunk_idx],
+                seq_starts=chunk_starts[chunk_idx],
+                dequant=False,
+                kv_cache_layout="NHD",
+                total_tokens=total_token_per_batch[chunk_idx],
+            )
+
+            suf_out, suf_lse = aiter.flash_attn_varlen_func(
+                q=query,
+                k=key_fetched,
+                v=value_fetched,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_kv[chunk_idx],
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_k=max_seqlens[chunk_idx],
+                min_seqlen_q=min_seqlen_q,
+                dropout_p=0.0,
+                softmax_scale=self.scale,
+                causal=False,
+                window_size=self.sliding_window,
+                alibi_slopes=self.alibi_slopes,
+                return_lse=True,
+            )
+            if chunked_output is None:
+                chunked_output = suf_out
+                chunked_lse = suf_lse
+            else:
+                tmp_output = torch.empty_like(out)
+                tmp_lse = torch.empty_like(lse)
+                merge_attn_states(
+                    output=tmp_output,
+                    output_lse=tmp_lse,
+                    prefix_output=chunked_output,
+                    prefix_lse=chunked_lse,
+                    suffix_output=suf_out,
+                    suffix_lse=suf_lse,
+                )
+                chunked_output = tmp_output
+                chunked_lse = tmp_lse
+
+        merge_attn_states(
+            output=output,
+            prefix_output=chunked_output,
+            prefix_lse=chunked_lse,
+            suffix_output=out,
+            suffix_lse=lse,
+        )
+
     def forward(
         self,
         layer: torch.nn.Module,
@@ -488,24 +684,25 @@ class AiterFlashAttentionImpl(AttentionImpl):
             return output.fill_(0)
 
         # IMPORTANT!
-        # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
-        # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead
-        # in this method. For example, `view` and `slice` (or `[:n]`) operations
-        # are surprisingly slow even in the case they do not invoke any GPU ops.
+        # NOTE(woosuk): With piece-wise CUDA graphs, this method is
+        # executed in eager-mode PyTorch. Thus, we need to be careful
+        # about any CPU overhead in this method. For example, `view`
+        # and `slice` (or `[:n]`) operations are surprisingly slow even
+        # in the case they do not invoke any GPU ops.
         # Minimize the PyTorch ops in this method as much as possible.
         # Whenever making a change in this method, please benchmark the
         # performance to make sure it does not introduce any overhead.
-
         num_actual_tokens = attn_metadata.num_actual_tokens
         key_cache, value_cache = kv_cache.unbind(0)
         if self.kv_sharing_target_layer_name is None:
             # Reshape the input keys and values and store them in the cache.
             # Skip this if sharing KV cache with an earlier attention layer.
-            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
-            # not padded. However, we don't need to do key[:num_actual_tokens]
-            # and value[:num_actual_tokens] because the reshape_and_cache_flash
-            # op uses the slot_mapping's shape to determine the number of
-            # actual tokens.
+            # NOTE(woosuk): Here, key and value are padded while slot_mapping
+            # is not padded. However, we don't need to do
+            # key[:num_actual_tokens] and value[:num_actual_tokens] because
+            # the reshape_and_cache_flash op uses the slot_mapping's shape
+            # to determine the number of actual tokens.
+
             torch.ops._C_cache_ops.reshape_and_cache_flash(
                 key,
                 value,
@@ -521,69 +718,118 @@ class AiterFlashAttentionImpl(AttentionImpl):
             key_cache = key_cache.view(current_platform.fp8_dtype())
             value_cache = value_cache.view(current_platform.fp8_dtype())
 
-        if not attn_metadata.use_cascade:
-            cu_seqlens_q = attn_metadata.query_start_loc
-            seqused_k = attn_metadata.seq_lens
-            max_seqlen_q = attn_metadata.max_query_len
-            max_seqlen_k = attn_metadata.max_seq_len
-            block_table = attn_metadata.block_table
+        # decode:extend:prefill
+        query = query[:num_actual_tokens]
+        key = key[:num_actual_tokens]
+        value = value[:num_actual_tokens]
 
-            if max_seqlen_q > 1:
-                torch.ops.vllm.flash_attn_varlen_func(
-                    query[:num_actual_tokens],
-                    key_cache,
-                    value_cache,
-                    out=output[:num_actual_tokens],
-                    cu_seqlens_q=cu_seqlens_q,
-                    max_seqlen_q=max_seqlen_q,
-                    max_seqlen_k=max_seqlen_k,
+        output_actual_tokens = output[:num_actual_tokens]
+
+        num_decodes = attn_metadata.num_decodes
+        num_prefills = attn_metadata.num_prefills
+        num_extends = attn_metadata.num_extends
+
+        num_decode_tokens = attn_metadata.num_decode_tokens
+        num_extend_tokens = attn_metadata.num_extend_tokens
+        if not attn_metadata.use_cascade:
+            # calculate for pure prefills
+            if num_prefills > 0:
+                assert attn_metadata.prefill_metadata is not None
+
+                prefill_query = query[num_decode_tokens + num_extend_tokens :]
+                prefill_key = key[num_decode_tokens + num_extend_tokens :]
+                prefill_value = value[num_decode_tokens + num_extend_tokens :]
+
+                aiter.flash_attn_varlen_func(
+                    q=prefill_query,
+                    k=prefill_key,
+                    v=prefill_value,
+                    cu_seqlens_q=attn_metadata.prefill_metadata.query_start_loc,
+                    cu_seqlens_k=attn_metadata.prefill_metadata.query_start_loc,
+                    max_seqlen_q=attn_metadata.prefill_metadata.max_query_len,
+                    max_seqlen_k=attn_metadata.prefill_metadata.max_seq_len,
+                    min_seqlen_q=attn_metadata.prefill_metadata.min_query_len,
+                    dropout_p=0.0,
                     softmax_scale=self.scale,
-                    alibi_slopes=self.alibi_slopes,
+                    causal=True,
                     window_size=self.sliding_window,
-                    block_table=block_table,
-                    cu_seqlens_k=attn_metadata.cu_seq_lens,
-                    k_scale=layer._k_scale,
-                    v_scale=layer._v_scale,
-                    total_tokens=attn_metadata.num_actual_kv_tokens,
+                    alibi_slopes=self.alibi_slopes,
+                    out=output_actual_tokens[num_decode_tokens + num_extend_tokens :],
                 )
 
-            _, num_heads, head_size = query.shape
-            nbytes_per_qo_elem = torch.finfo(query.dtype).bits // 8
-            num_seqs = seqused_k.shape[0]
-            max_num_partitions = (
-                max_seqlen_k + _PARTITION_SIZE_ROCM - 1
-            ) // _PARTITION_SIZE_ROCM
+            # calculate for extends
+            if num_extends > 0:
+                assert attn_metadata.extend_metadata is not None
+                extend_tokens_slice = slice(
+                    num_decode_tokens, num_decode_tokens + num_extend_tokens
+                )
+                extend_querys = query[extend_tokens_slice]
+                extend_keys = key[extend_tokens_slice]
+                extend_values = value[extend_tokens_slice]
+                extend_outputs = output[extend_tokens_slice]
+                self.extend_forward(
+                    attn_metadata=attn_metadata,
+                    query=extend_querys,
+                    key=extend_keys,
+                    value=extend_values,
+                    key_cache=key_cache,
+                    value_cache=value_cache,
+                    output=extend_outputs,
+                    cu_seqlens_q=attn_metadata.extend_metadata.query_start_loc,
+                    max_seqlen_q=attn_metadata.extend_metadata.max_query_len,
+                    max_seqlen_k=attn_metadata.extend_metadata.max_seq_len,
+                    min_seqlen_q=attn_metadata.extend_metadata.min_query_len,
+                    block_table=attn_metadata.block_table[
+                        num_decodes : num_decodes + num_extends
+                    ],
+                    slot_mapping=attn_metadata.slot_mapping[
+                        num_decodes : num_decodes + num_extends
+                    ],
+                    k_scale=layer._k_scale,
+                    v_scale=layer._v_scale,
+                )
 
-            workspace_buffer = torch.empty(
-                (num_seqs * num_heads * max_num_partitions * head_size)
-                * nbytes_per_qo_elem
-                + 2 * (num_seqs * num_heads * max_num_partitions) * 4,
-                dtype=torch.uint8,
-                device=output.device,
-            )
+            # calculate for decodes
+            if num_decodes > 0:
+                assert attn_metadata.decode_metadata is not None
+                _, num_heads, head_size = query.shape
+                nbytes_per_qo_elem = torch.finfo(query.dtype).bits // 8
+                num_seqs = attn_metadata.seq_lens.shape[0]
+                max_num_partitions = (
+                    attn_metadata.max_seq_len + _PARTITION_SIZE_ROCM - 1
+                ) // _PARTITION_SIZE_ROCM
 
-            torch.ops.aiter.paged_attention_v1(
-                output[:num_actual_tokens],
-                workspace_buffer,
-                query[:num_actual_tokens],
-                key_cache,
-                value_cache,
-                self.scale,
-                block_table,
-                cu_seqlens_q,
-                seqused_k,
-                max_seqlen_k,
-                self.alibi_slopes,
-                self.kv_cache_dtype,
-                "NHD",
-                self.logits_soft_cap,
-                layer._k_scale,
-                layer._v_scale,
-                None,
-                _PARTITION_SIZE_ROCM,
-            )
-            return output
+                workspace_buffer = torch.empty(
+                    (num_seqs * num_heads * max_num_partitions * head_size)
+                    * nbytes_per_qo_elem
+                    + 2 * (num_seqs * num_heads * max_num_partitions) * 4,
+                    dtype=torch.uint8,
+                    device=output.device,
+                )
+
+                torch.ops.aiter.paged_attention_v1(
+                    output[:num_decode_tokens],
+                    workspace_buffer,
+                    query[:num_decode_tokens],
+                    key_cache,
+                    value_cache,
+                    self.scale,
+                    attn_metadata.block_table[:num_decodes],
+                    attn_metadata.query_start_loc[:num_decodes],
+                    attn_metadata.seq_lens[:num_decodes],
+                    attn_metadata.max_seq_len,
+                    self.alibi_slopes,
+                    self.kv_cache_dtype,
+                    "NHD",
+                    self.logits_soft_cap,
+                    layer._k_scale,
+                    layer._v_scale,
+                    None,
+                    _PARTITION_SIZE_ROCM,
+                )
         else:
             raise NotImplementedError(
                 "Cascade attention is not implemented for ROCM AITER"
             )
+
+        return output
diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
index 27b072106268b..b2639c0df0412 100644
--- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
+++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
@@ -5,7 +5,7 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import AttentionMetadata, AttentionType
+from vllm.attention.backends.abstract import AttentionType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
@@ -15,7 +15,6 @@ from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.rocm_attn import (
     RocmAttentionBackend,
     RocmAttentionImpl,
-    RocmAttentionMetadata,
     RocmAttentionMetadataBuilder,
 )
 
@@ -33,10 +32,6 @@ class RocmAiterUnifiedAttentionBackend(RocmAttentionBackend):
     def get_impl_cls() -> type["RocmAiterUnifiedAttentionImpl"]:
         return RocmAiterUnifiedAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return RocmAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 8b7ce90a3ccae..57ba4dc78d9fd 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -10,7 +10,6 @@ import torch
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionImpl,
-    AttentionMetadata,
     AttentionType,
 )
 from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
@@ -182,10 +181,6 @@ class RocmAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["RocmAttentionImpl"]:
         return RocmAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return RocmAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py
index ee6ead9ad9b35..0c0222d6152fb 100644
--- a/vllm/v1/attention/backends/tree_attn.py
+++ b/vllm/v1/attention/backends/tree_attn.py
@@ -12,7 +12,6 @@ from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionImpl,
-    AttentionMetadata,
     AttentionType,
     MultipleOf,
 )
@@ -64,10 +63,6 @@ class TreeAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["TreeAttentionImpl"]:
         return TreeAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return TreeAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index b1d34dbfd1729..0590a87bf8e5f 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -10,7 +10,6 @@ import torch
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionImpl,
-    AttentionMetadata,
     AttentionType,
     MultipleOf,
 )
@@ -176,10 +175,6 @@ class TritonAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["TritonAttentionImpl"]:
         return TritonAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return TritonAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 07d62e9849e00..07dfbc766acd1 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -264,7 +264,10 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
         self.device = device
 
     def _init_reorder_batch_threshold(
-        self, reorder_batch_threshold: int = 1, supports_spec_as_decode: bool = False
+        self,
+        reorder_batch_threshold: int = 1,
+        supports_spec_as_decode: bool = False,
+        supports_dcp_with_varlen: bool = False,
     ) -> None:
         self.reorder_batch_threshold = reorder_batch_threshold
         if self.reorder_batch_threshold is not None and supports_spec_as_decode:
@@ -281,6 +284,12 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
                     1 + speculative_config.num_speculative_tokens,
                 )
 
+        if (
+            self.vllm_config.parallel_config.decode_context_parallel_size > 1
+            and not supports_dcp_with_varlen
+        ):
+            self.reorder_batch_threshold = 1
+
     @abstractmethod
     def build(
         self,
@@ -728,6 +737,73 @@ def subclass_attention_backend(
     )
 
 
+def split_decodes_prefills_and_extends(
+    common_attn_metadata: CommonAttentionMetadata,
+    decode_threshold: int = 1,
+) -> tuple[int, int, int, int, int, int]:
+    """
+    Assuming a reordered batch, finds the boundary between prefill and decode
+    requests.
+
+    Args:
+        common_attn_metadata: CommonAttentionMetadata object containing the
+            batch metadata.
+        decode_threshold: The maximum query length to be considered a decode.
+
+    Returns:
+        num_decodes: The number of decode requests.
+        num_extends: The number of extend requests.
+        num_prefills: The number of prefill requests.
+        num_decode_tokens: The number of tokens in the decode requests.
+        num_extend_tokens: The number of tokens in the extend requests.
+        num_prefill_tokens: The number of tokens in the prefill requests.
+    """
+    max_query_len = common_attn_metadata.max_query_len
+    num_reqs = common_attn_metadata.num_reqs
+    num_tokens = common_attn_metadata.num_actual_tokens
+    query_start_loc = common_attn_metadata.query_start_loc_cpu
+    seq_lens = common_attn_metadata.seq_lens_cpu
+
+    if max_query_len <= decode_threshold:
+        return num_reqs, 0, 0, num_tokens, 0, 0
+
+    query_lens = query_start_loc[1:] - query_start_loc[:-1]
+    is_prefill_or_extend = query_lens > decode_threshold
+    is_prefill = (seq_lens == query_lens) & is_prefill_or_extend
+    first_extend = is_prefill_or_extend.int().argmax(dim=-1).item()
+    first_prefill = is_prefill.int().argmax(dim=-1).item()
+    num_decodes = first_extend
+    num_decode_tokens = query_start_loc[first_extend].item()
+    if not torch.any(is_prefill_or_extend):
+        return (num_decodes, 0, 0, num_decode_tokens, 0, 0)
+
+    num_prefills_or_extends = num_reqs - num_decodes
+    num_prefill_or_extend_tokens = num_tokens - num_decode_tokens
+    if not torch.any(is_prefill):
+        return (
+            num_decodes,
+            num_prefills_or_extends,
+            0,
+            num_decode_tokens,
+            num_prefill_or_extend_tokens,
+            0,
+        )
+
+    num_extends = first_prefill - num_decodes
+    num_prefills = num_reqs - first_prefill
+
+    num_prefill_tokens = num_tokens - query_start_loc[first_prefill]
+    num_extend_tokens = num_prefill_or_extend_tokens - num_prefill_tokens
+    return (
+        num_decodes,
+        num_extends,
+        num_prefills,
+        num_decode_tokens,
+        num_extend_tokens,
+        num_prefill_tokens,
+    )
+
+
 def split_decodes_and_prefills(
     common_attn_metadata: CommonAttentionMetadata,
     decode_threshold: int = 1,
@@ -1000,3 +1076,41 @@ def compute_causal_conv1d_metadata(query_start_loc_p: torch.Tensor):
         nums_dict[BLOCK_M]["token_chunk_offset_ptr"] = token_chunk_offset_ptr  # type: ignore
 
     return nums_dict, batch_ptr, token_chunk_offset_ptr
+
+
+def get_dcp_local_seq_lens(
+    seq_lens: torch.Tensor,
+    dcp_world_size: int = 1,
+    dcp_rank: int | None = None,
+    dcp_kv_cache_interleave_size: int = 1,
+) -> torch.Tensor:
+    """While using dcp, kv_cache size stored on each rank may be different,
+    use this function to calculate split decode seq_lens of each dcp rank.
+    Only consider dcp now, we can extend the case of cp based on this.
+    """
+    num_requests = seq_lens.size(0)
+    if dcp_rank is None:
+        rank_offsets = (
+            torch.arange(dcp_world_size, dtype=torch.int32)
+            .unsqueeze(0)
+            .repeat(num_requests, 1)
+        )
+    else:
+        rank_offsets = torch.Tensor([[dcp_rank]]).to(dtype=torch.int32)
+    seq_lens_tiled = (
+        seq_lens.to(torch.int32).unsqueeze(-1).repeat(1, rank_offsets.shape[1])
+    )
+    base = (
+        seq_lens_tiled
+        // dcp_kv_cache_interleave_size
+        // dcp_world_size
+        * dcp_kv_cache_interleave_size
+    )
+    remainder = seq_lens_tiled - base * dcp_world_size
+    remainder = torch.clip(
+        remainder - rank_offsets * dcp_kv_cache_interleave_size,
+        0,
+        dcp_kv_cache_interleave_size,
+    )
+    dcp_local_seq_lens = base + remainder
+    return dcp_local_seq_lens.squeeze(1)
diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py
index 457b15ebdd82f..81bdbd641429a 100644
--- a/vllm/v1/attention/backends/xformers.py
+++ b/vllm/v1/attention/backends/xformers.py
@@ -10,7 +10,6 @@ import torch
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionImpl,
-    AttentionMetadata,
     AttentionType,
     MultipleOf,
 )
@@ -105,10 +104,6 @@ class XFormersAttentionBackend(AttentionBackend):
     def get_impl_cls() -> type["XFormersAttentionImpl"]:
         return XFormersAttentionImpl
 
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return XFormersAttentionMetadata
-
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index 15c06a0b107d8..55710ad5cc693 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -259,7 +259,9 @@ class BlockPool:
                         num_cached_blocks * block_size : num_full_blocks * block_size
                     ],
                     block_size=block_size,
-                    lora_id=request.lora_request.id if request.lora_request else None,
+                    lora_id=request.lora_request.adapter_id
+                    if request.lora_request
+                    else None,
                     medium=MEDIUM_GPU,
                 )
             )
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index 291d33c9bf989..88d99d9402821 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -4,16 +4,34 @@ from abc import ABC, abstractmethod
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Optional
 
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+
 if TYPE_CHECKING:
+    from vllm.config import VllmConfig
     from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
     from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
     from vllm.v1.engine import EngineCoreOutputs
+    from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.metrics.stats import SchedulerStats
     from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
     from vllm.v1.request import Request, RequestStatus
+    from vllm.v1.structured_output import StructuredOutputManager
 
 
 class SchedulerInterface(ABC):
+    @abstractmethod
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        kv_cache_config: "KVCacheConfig",
+        structured_output_manager: "StructuredOutputManager",
+        block_size: int,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+        include_finished_set: bool = False,
+        log_stats: bool = False,
+    ) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def schedule(self) -> "SchedulerOutput":
         """Schedule the requests to process in this scheduling step.
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index f51744eb2640b..46dc1071b8395 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import copy
 import itertools
 import time
 from collections import defaultdict
@@ -39,6 +38,7 @@ from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
 from vllm.v1.structured_output import StructuredOutputManager
+from vllm.v1.utils import record_function_or_nullcontext
 
 logger = init_logger(__name__)
 
@@ -92,15 +92,10 @@ class Scheduler(SchedulerInterface):
             assert not self.is_encoder_decoder, (
                 "Encoder-decoder models are not currently supported with KV connectors"
             )
-
-            connector_vllm_config = copy.copy(self.vllm_config)
-
-            # We're dynamically inserting a kv_cache_config variable into the
-            # connector_vllm_config. This is distinct from the cache_config
-            # that is already in there.
-            connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config)  # type: ignore[attr-defined]
             self.connector = KVConnectorFactory.create_connector(
-                config=connector_vllm_config, role=KVConnectorRole.SCHEDULER
+                config=self.vllm_config,
+                role=KVConnectorRole.SCHEDULER,
+                kv_cache_config=self.kv_cache_config,
             )
             if self.log_stats:
                 self.connector_prefix_cache_stats = PrefixCacheStats()
@@ -223,10 +218,14 @@ class Scheduler(SchedulerInterface):
                 num_new_tokens = self.scheduler_config.long_prefill_token_threshold
             num_new_tokens = min(num_new_tokens, token_budget)
 
-            # Make sure the input position does not exceed the max model len.
-            # This is necessary when using spec decoding.
+            # Make sure the input position does not exceed the max model len or
+            # request's max_tokens.
+            # This is necessary when using spec decoding and/or async scheduling.
+            max_total_tokens = min(
+                request.num_prompt_tokens + request.max_tokens, self.max_model_len
+            )
             num_new_tokens = min(
-                num_new_tokens, self.max_model_len - 1 - request.num_computed_tokens
+                num_new_tokens, max_total_tokens - 1 - request.num_computed_tokens
             )
 
             # Schedule encoder inputs.
@@ -261,49 +260,52 @@ class Scheduler(SchedulerInterface):
                 continue
 
             # Schedule newly needed KV blocks for the request.
-            while True:
-                new_blocks = self.kv_cache_manager.allocate_slots(
-                    request,
-                    num_new_tokens,
-                    num_lookahead_tokens=self.num_lookahead_tokens,
-                )
-
-                if new_blocks is not None:
-                    # The request can be scheduled.
-                    break
-
-                # The request cannot be scheduled.
-                # Preempt the lowest-priority request.
-                if self.policy == SchedulingPolicy.PRIORITY:
-                    preempted_req = max(
-                        self.running,
-                        key=lambda r: (r.priority, r.arrival_time),
-                    )
-                    self.running.remove(preempted_req)
-                    if preempted_req in scheduled_running_reqs:
-                        scheduled_running_reqs.remove(preempted_req)
-                        token_budget += num_scheduled_tokens[preempted_req.request_id]
-                        req_to_new_blocks.pop(preempted_req.request_id)
-                        num_scheduled_tokens.pop(preempted_req.request_id)
-                        req_index -= 1
-                else:
-                    preempted_req = self.running.pop()
-
-                self.kv_cache_manager.free(preempted_req)
-                self.encoder_cache_manager.free(preempted_req)
-                preempted_req.status = RequestStatus.PREEMPTED
-                preempted_req.num_computed_tokens = 0
-                preempted_req.num_preemptions += 1
-                if self.log_stats:
-                    preempted_req.record_event(
-                        EngineCoreEventType.PREEMPTED, scheduled_timestamp
+            with record_function_or_nullcontext("schedule: allocate_slots"):
+                while True:
+                    new_blocks = self.kv_cache_manager.allocate_slots(
+                        request,
+                        num_new_tokens,
+                        num_lookahead_tokens=self.num_lookahead_tokens,
                     )
 
-                self.waiting.prepend_request(preempted_req)
-                preempted_reqs.append(preempted_req)
-                if preempted_req == request:
-                    # No more request to preempt. Cannot schedule this request.
-                    break
+                    if new_blocks is not None:
+                        # The request can be scheduled.
+                        break
+
+                    # The request cannot be scheduled.
+                    # Preempt the lowest-priority request.
+                    if self.policy == SchedulingPolicy.PRIORITY:
+                        preempted_req = max(
+                            self.running,
+                            key=lambda r: (r.priority, r.arrival_time),
+                        )
+                        self.running.remove(preempted_req)
+                        if preempted_req in scheduled_running_reqs:
+                            scheduled_running_reqs.remove(preempted_req)
+                            token_budget += num_scheduled_tokens[
+                                preempted_req.request_id
+                            ]
+                            req_to_new_blocks.pop(preempted_req.request_id)
+                            num_scheduled_tokens.pop(preempted_req.request_id)
+                            req_index -= 1
+                    else:
+                        preempted_req = self.running.pop()
+
+                    self.kv_cache_manager.free(preempted_req)
+                    self.encoder_cache_manager.free(preempted_req)
+                    preempted_req.status = RequestStatus.PREEMPTED
+                    preempted_req.num_computed_tokens = 0
+                    preempted_req.num_preemptions += 1
+                    if self.log_stats:
+                        preempted_req.record_event(
+                            EngineCoreEventType.PREEMPTED, scheduled_timestamp
+                        )
+
+                    self.waiting.prepend_request(preempted_req)
+                    preempted_reqs.append(preempted_req)
+                    if preempted_req == request:
+                        # No more request to preempt. Cannot schedule this request.
+                        break
 
             if new_blocks is None:
                 # Cannot schedule this request.
@@ -327,6 +329,9 @@ class Scheduler(SchedulerInterface):
                     scheduled_spec_decode_tokens[request.request_id] = (
                         request.spec_token_ids
                     )
+                # New spec tokens will be set in `update_draft_token_ids` before the
+                # next step when applicable.
+                request.spec_token_ids = []
 
             # Encoder-related.
             if encoder_inputs_to_schedule:
@@ -598,13 +603,14 @@ class Scheduler(SchedulerInterface):
         # Get the longest common prefix among all requests in the running queue.
         # This can be potentially used for cascade attention.
         num_common_prefix_blocks = [0] * len(self.kv_cache_config.kv_cache_groups)
-        if self.running:
-            any_request = self.running[0]
-            num_common_prefix_blocks = (
-                self.kv_cache_manager.get_num_common_prefix_blocks(
-                    any_request.request_id
+        with record_function_or_nullcontext("schedule: get_num_common_prefix_blocks"):
+            if self.running:
+                any_request = self.running[0]
+                num_common_prefix_blocks = (
+                    self.kv_cache_manager.get_num_common_prefix_blocks(
+                        any_request.request_id
+                    )
                 )
-            )
 
         # Construct the scheduler output.
         new_reqs_data = [
@@ -613,13 +619,14 @@ class Scheduler(SchedulerInterface):
             )
             for req in scheduled_new_reqs
         ]
-        cached_reqs_data = self._make_cached_request_data(
-            scheduled_running_reqs,
-            scheduled_resumed_reqs,
-            num_scheduled_tokens,
-            scheduled_spec_decode_tokens,
-            req_to_new_blocks,
-        )
+        with record_function_or_nullcontext("schedule: make_cached_request_data"):
+            cached_reqs_data = self._make_cached_request_data(
+                scheduled_running_reqs,
+                scheduled_resumed_reqs,
+                num_scheduled_tokens,
+                scheduled_spec_decode_tokens,
+                req_to_new_blocks,
+            )
 
         # Record the request ids that were scheduled in this step.
         self.prev_step_scheduled_req_ids.clear()
@@ -648,8 +655,8 @@ class Scheduler(SchedulerInterface):
         if self.connector is not None:
             meta = self.connector.build_connector_meta(scheduler_output)
             scheduler_output.kv_connector_metadata = meta
-
-        self._update_after_schedule(scheduler_output)
+        with record_function_or_nullcontext("schedule: update_after_schedule"):
+            self._update_after_schedule(scheduler_output)
         return scheduler_output
 
     def _update_after_schedule(
@@ -1025,6 +1032,7 @@ class Scheduler(SchedulerInterface):
                         kv_transfer_params=kv_transfer_params,
                         trace_headers=request.trace_headers,
                         num_cached_tokens=request.num_cached_tokens,
+                        num_nans_in_logits=request.num_nans_in_logits,
                     )
                 )
             else:
@@ -1150,10 +1158,7 @@ class Scheduler(SchedulerInterface):
                 continue
 
             # Add newly generated spec token ids to the request.
-            if not spec_token_ids:
-                # NOTE(woosuk): request.spec_token_ids should be updated.
-                request.spec_token_ids.clear()
-            elif self.structured_output_manager.should_advance(request):
+            if self.structured_output_manager.should_advance(request):
                 metadata = request.structured_output_request
                 request.spec_token_ids = metadata.grammar.validate_tokens(  # type: ignore[union-attr]
                     spec_token_ids
@@ -1261,7 +1266,6 @@ class Scheduler(SchedulerInterface):
             prefix_cache_stats=prefix_cache_stats,
             connector_prefix_cache_stats=connector_prefix_cache_stats,
             spec_decoding_stats=spec_decoding_stats,
-            num_corrupted_reqs=sum(req.is_output_corrupted for req in self.running),
             kv_connector_stats=kv_connector_stats.data if kv_connector_stats else None,
         )
 
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 8f14fb1894707..14ac83028ee44 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -243,18 +243,53 @@ class SingleTypeKVCacheManager(ABC):
 
         raise NotImplementedError
 
-    @abstractmethod
     def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
         """
-        Remove the blocks that are no longer needed from `blocks` and free the
-        blocks. The removed blocks should be replaced by null_block.
-        Need to be customized for each attention type.
+        Remove and free the blocks that are no longer needed for attention computation.
+        The removed blocks should be replaced by null_block.
+
+        This function depends on `get_num_skipped_tokens`, which need to be implemented
+        differently for each attention type.
 
         Args:
             request_id: The request ID.
             num_computed_tokens: The number of tokens that have been computed.
         """
-        raise NotImplementedError
+        # Remove the blocks that will be skipped during attention computation.
+        num_skipped_tokens = self.get_num_skipped_tokens(num_computed_tokens)
+        if num_skipped_tokens <= 0:
+            # This indicates that ALL tokens are inside attention window.
+            # Thus we do not need to free any blocks outside attention window.
+            # A typical case is full attention that we never free any token
+            # before the request is finished.
+            return
+        num_skipped_blocks = num_skipped_tokens // self.block_size
+        blocks = self.req_to_blocks[request_id]
+        removed_blocks: list[KVCacheBlock] = []
+        # Because the block starts from index 0, the num_skipped_block-th block
+        # corresponds to index num_skipped_blocks - 1.
+        for i in range(num_skipped_blocks - 1, -1, -1):
+            if blocks[i] == self._null_block:
+                # If the block is already a null block, the blocks before it
+                # should also have been set to null blocks by the previous calls
+                # to this function.
+                break
+            removed_blocks.append(blocks[i])
+            blocks[i] = self._null_block
+        self.block_pool.free_blocks(removed_blocks)
+
+    def get_num_skipped_tokens(self, num_computed_tokens: int) -> int:
+        """
+        Get the number of tokens that will be skipped for attention computation.
+
+        Args:
+            num_computed_tokens: The number of tokens that have been computed.
+
+        Returns:
+            The number of tokens that will be skipped for attention computation.
+        """
+        # The default behavior is to not skip any tokens.
+        return 0
 
 
 class FullAttentionManager(SingleTypeKVCacheManager):
@@ -298,10 +333,6 @@ class FullAttentionManager(SingleTypeKVCacheManager):
                 computed.pop()
         return computed_blocks
 
-    def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
-        # No need to remove blocks for full attention.
-        pass
-
     def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
         blocks = self.req_to_blocks[running_request_id]
         num_common_blocks = 0
@@ -389,28 +420,33 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
                 computed.pop()
         return computed_blocks
 
-    def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
-        # Remove the blocks that are no longer be in the sliding window and
-        # skipped during the attention computation.
-        last_useful_token = num_computed_tokens - self.sliding_window + 1
-        last_useful_block = last_useful_token // self.block_size
-        if last_useful_block <= 0:
-            # Early return if tokens are not enough to fill the sliding window
-            return
-        blocks = self.req_to_blocks[request_id]
-        if blocks[last_useful_block - 1] == self._null_block:
-            # Early return if there are no blocks to remove
-            return
-        removed_blocks: list[KVCacheBlock] = []
-        for i in range(last_useful_block - 1, -1, -1):
-            if blocks[i] == self._null_block:
-                # If the block is already a null block, the blocks before it
-                # should also have been set to null blocks by the previous calls
-                # to this function.
-                break
-            removed_blocks.append(blocks[i])
-            blocks[i] = self._null_block
-        self.block_pool.free_blocks(removed_blocks)
+    def get_num_skipped_tokens(self, num_computed_tokens: int) -> int:
+        """
+        Get the number of tokens that will be skipped for attention computation.
+
+        For sliding window, this corresponds to the tokens that are prior to
+        the current sliding window.
+
+        Example:
+        sliding_window=4, num_computed_tokens=7
+
+        Tokens:   [ 0  1  2  3  4  5  6  7 ]
+                  | ---- computed -----|
+                                         ^ next token to be computed
+                               |-----------| sliding window for next token
+                  |--skipped---|
+
+        The current window contains tokens 4~7. Tokens 0~3 will be skipped for
+        attention computation since they are outside the sliding window.
+        Thus, get_num_skipped_tokens(7) == 4.
+
+        Args:
+            num_computed_tokens: The number of tokens that have been computed.
+
+        Returns:
+            The number of tokens that will be skipped for attention computation.
+        """
+        return num_computed_tokens - self.sliding_window + 1
 
     def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
         """
@@ -511,40 +547,51 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
                 break
         return computed_blocks
 
-    def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
-        # Remove the blocks that are no longer be in the chunked attention
-        # window and skipped during the attention computation.
+    def get_num_skipped_tokens(self, num_computed_tokens: int) -> int:
+        """
+        Get the number of tokens that will be skipped for attention computation.
 
-        # [chunk 0][chunk 1]local_attention_start_idx ... current
-        # we computed previous number of chunks to get the idx of
-        # current chunk window starting offset,
-        # e.g. for computed 1024 tokens, the 1024th token (0 indexed)
-        # is in the second chunk, there are 1 prev chunk, the start idx
-        # is 1024. for 1023, it will be 0.
-        num_cached_block = self.num_cached_block.get(request_id, 0)
-        local_attention_start_idx = (
-            (num_computed_tokens)
-            // self.attention_chunk_size
-            * self.attention_chunk_size
-        )
-        first_useful_block_idx = local_attention_start_idx // self.block_size
-        if num_cached_block > 0:
-            # Make sure we don't delete the last cached block
-            first_useful_block_idx = min(first_useful_block_idx, num_cached_block - 1)
-        # if block size = 128, 0 -> block 0, 1024 (= 128 * 8) ->
-        # block 8, 372 (= 128 * 2 + 116) -> block 2
-        blocks = self.req_to_blocks[request_id]
-        removed_blocks: list[KVCacheBlock] = []
-        # we need to keep the last block to get the previous hash key
-        for i in range(first_useful_block_idx - 1, -1, -1):
-            if blocks[i] == self._null_block:
-                # If the block is already a null block, the blocks before it
-                # should also have been set to null blocks by the previous calls
-                # to this function.
-                break
-            removed_blocks.append(blocks[i])
-            blocks[i] = self._null_block
-        self.block_pool.free_blocks(removed_blocks)
+        For chunked local attention, this corresponds to the tokens that are on
+        the left side of the current chunk.
+
+        Example 1:
+        chunk size = 8, num_computed_tokens = 13
+        Tokens:  [ 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 ] ...
+                 | ----- computed ---------------|
+                                                  ^^ next token to be computed
+                                   |----------------| <-- attention window for
+                                                          next token
+                 |--- skipped -----|
+        Output: get_num_skipped_tokens(13) == 8
+
+        Example 2:
+        chunk size = 8, num_computed_tokens = 8
+        Tokens:  [ 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 ] ...
+                 | --- computed ---|
+                                     ^ next token to be computed
+                                   |--| <-- attention window for next token
+                 | --- skipped ----|
+        Output: get_num_skipped_tokens(8) == 8
+
+        Example 3:
+        chunk size = 8, num_computed_tokens = 7
+        Tokens:  [ 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 ] ...
+                 |---computed---|
+                                 ^ next token to be computed
+                 |-----------------| <-- attention window for next token
+                 no token should be skipped.
+        Output: get_num_skipped_tokens(7) == 0
+
+        Args:
+            num_computed_tokens: The number of tokens that have been computed.
+
+        Returns:
+            The number of tokens that will be skipped for attention computation.
+        """
+        num_skipped_tokens = (
+            num_computed_tokens // self.attention_chunk_size
+        ) * self.attention_chunk_size
+        return num_skipped_tokens
 
     def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
         """
@@ -590,12 +637,6 @@ class MambaManager(SingleTypeKVCacheManager):
 
         return computed_blocks
 
-    def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
-        # Here unused blocks may be freed up for running requests.
-        # TODO(@s3woz) Free up all blocks that aren't needed by Mamba2
-        #  (for which find_longest_cache_hit returns block_pool.null_block)
-        pass
-
     def get_num_common_prefix_blocks(self, running_request_id: str) -> int:
         """
         cascade attention is not supported by mamba
@@ -676,11 +717,6 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
         # Return empty blocks to indicate no cache hits
         raise NotImplementedError("CrossAttentionManager does not support caching")
 
-    def remove_skipped_blocks(self, request_id: str, num_computed_tokens: int) -> None:
-        # Cross-attention blocks represent encoder states which are needed
-        # for the entire decoding process, so no blocks should be skipped
-        pass
-
 
 spec_manager_map: dict[type[KVCacheSpec], type[SingleTypeKVCacheManager]] = {
     FullAttentionSpec: FullAttentionManager,
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index e2c1ed7b561c7..058a4bcaecb58 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -122,6 +122,10 @@ class EngineCoreOutput(
     # The number of tokens with prefix cache hits.
     num_cached_tokens: int = 0
 
+    # The number of NaNs in logits.
+    # A value greater than 0 indicates that the output is corrupted.
+    num_nans_in_logits: int = 0
+
     @property
     def finished(self) -> bool:
         return self.finish_reason is not None
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index dc61d45015682..aee21fb3fffe7 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -88,14 +88,6 @@ class AsyncLLM(EngineClient):
         Returns:
             None
         """
-        if not envs.VLLM_USE_V1:
-            raise ValueError(
-                "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
-                "This should not happen. As a workaround, try using "
-                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
-                "VLLM_USE_V1=0 or 1 and report this issue on Github."
-            )
-
         # Ensure we can serialize custom transformer configs
         maybe_register_config_serialize_by_value()
 
@@ -206,14 +198,6 @@ class AsyncLLM(EngineClient):
         client_index: int = 0,
         disable_log_requests: bool = True,  # Deprecated, will be removed
     ) -> "AsyncLLM":
-        if not envs.VLLM_USE_V1:
-            raise ValueError(
-                "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
-                "This should not happen. As a workaround, try using "
-                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
-                "VLLM_USE_V1=0 or 1 and report this issue on Github."
-            )
-
         # Create the LLMEngine.
         return cls(
             vllm_config=vllm_config,
@@ -524,6 +508,8 @@ class AsyncLLM(EngineClient):
                             processed_outputs.reqs_to_abort
                         )
 
+                    output_processor.update_scheduler_stats(outputs.scheduler_stats)
+
                     # 4) Logging.
                     # TODO(rob): make into a coroutine and launch it in
                     # background thread once Prometheus overhead is non-trivial.
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 78af197821e2e..ffb5232e770d1 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import gc
 import os
 import queue
 import signal
@@ -27,9 +26,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import engine_receiver_cache_from_config
 from vllm.tasks import POOLING_TASKS, SupportedTask
 from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
-from vllm.utils.gc_utils import maybe_attach_gc_debug_callback
+from vllm.utils.gc_utils import (
+    freeze_gc_heap,
+    maybe_attach_gc_debug_callback,
+)
 from vllm.utils.hashing import get_hash_fn_by_name
-from vllm.utils.import_utils import resolve_obj_by_qualname
 from vllm.utils.network_utils import make_zmq_socket
 from vllm.utils.system_utils import decorate_logs, set_process_title
 from vllm.v1.core.kv_cache_utils import (
@@ -41,7 +42,6 @@ from vllm.v1.core.kv_cache_utils import (
 )
 from vllm.v1.core.sched.interface import SchedulerInterface
 from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
 from vllm.v1.engine import (
     EngineCoreOutputs,
     EngineCoreRequest,
@@ -63,6 +63,7 @@ from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 from vllm.v1.structured_output import StructuredOutputManager
+from vllm.v1.utils import record_function_or_nullcontext
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
@@ -117,23 +118,7 @@ class EngineCore:
         self.structured_output_manager = StructuredOutputManager(vllm_config)
 
         # Setup scheduler.
-        if isinstance(vllm_config.scheduler_config.scheduler_cls, str):
-            Scheduler = resolve_obj_by_qualname(
-                vllm_config.scheduler_config.scheduler_cls
-            )
-        else:
-            Scheduler = vllm_config.scheduler_config.scheduler_cls
-
-        # This warning can be removed once the V1 Scheduler interface is
-        # finalized and we can maintain support for scheduler classes that
-        # implement it
-        if Scheduler is not V1Scheduler:
-            logger.warning(
-                "Using configured V1 scheduler class %s. "
-                "This scheduler interface is not public and "
-                "compatibility may not be maintained.",
-                vllm_config.scheduler_config.scheduler_cls,
-            )
+        Scheduler = vllm_config.scheduler_config.get_scheduler_cls()
 
         if len(kv_cache_config.kv_cache_groups) == 0:
             # Encoder models without KV cache don't support
@@ -214,6 +199,10 @@ class EngineCore:
             self.step if self.batch_queue is None else self.step_with_batch_queue
         )
 
+        # Mark the startup heap as static so that it's ignored by GC.
+        # Reduces pause times of oldest generation collections.
+        freeze_gc_heap()
+
     def _initialize_kv_caches(
         self, vllm_config: VllmConfig
     ) -> tuple[int, int, KVCacheConfig]:
@@ -333,17 +322,21 @@ class EngineCore:
         # or finished and not yet removed from the batch.
         if not self.scheduler.has_requests():
             return {}, False
-        scheduler_output = self.scheduler.schedule()
-        future = self.model_executor.execute_model(scheduler_output, non_block=True)
-        grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
-        with self.log_error_detail(scheduler_output):
-            model_output = future.result()
-            if model_output is None:
-                model_output = self.model_executor.sample_tokens(grammar_output)
+        with record_function_or_nullcontext("core step: schedule"):
+            scheduler_output = self.scheduler.schedule()
 
-        engine_core_outputs = self.scheduler.update_from_output(
-            scheduler_output, model_output
-        )
+        with record_function_or_nullcontext("core step: execute_model"):
+            future = self.model_executor.execute_model(scheduler_output, non_block=True)
+            grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
+            with self.log_error_detail(scheduler_output):
+                model_output = future.result()
+                if model_output is None:
+                    model_output = self.model_executor.sample_tokens(grammar_output)
+
+        with record_function_or_nullcontext("core step: update_from_output"):
+            engine_core_outputs = self.scheduler.update_from_output(
+                scheduler_output, model_output
+            )
 
         return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0
 
@@ -381,32 +374,49 @@ class EngineCore:
         model_executed = False
         deferred_scheduler_output = None
         if self.scheduler.has_requests():
-            scheduler_output = self.scheduler.schedule()
-            exec_future = self.model_executor.execute_model(
-                scheduler_output, non_block=True
-            )
+            with record_function_or_nullcontext("core step_with_batch_queue: schedule"):
+                scheduler_output = self.scheduler.schedule()
+            with record_function_or_nullcontext(
+                "core step_with_batch_queue: execute_model"
+            ):
+                exec_future = self.model_executor.execute_model(
+                    scheduler_output, non_block=True
+                )
             model_executed = scheduler_output.total_num_scheduled_tokens > 0
 
             if scheduler_output.pending_structured_output_tokens:
-                # We need to defer sampling until we have processed the model output
-                # from the prior step.
-                deferred_scheduler_output = scheduler_output
-                # Block-wait for execute to return (continues running async on the GPU).
-                with self.log_error_detail(scheduler_output):
-                    exec_result = exec_future.result()
-                    assert exec_result is None
+                with record_function_or_nullcontext(
+                    "core step_with_batch_queue: pending_structured_output_tokens"
+                ):
+                    # We need to defer sampling until we have processed the model output
+                    # from the prior step.
+                    deferred_scheduler_output = scheduler_output
+                    # Block-wait for execute to return
+                    # (continues running async on the GPU).
+                    with self.log_error_detail(scheduler_output):
+                        exec_result = exec_future.result()
+                        assert exec_result is None
             else:
-                # We aren't waiting for any tokens, get any grammar output immediately.
-                grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
+                with record_function_or_nullcontext(
+                    "core step_with_batch_queue: get_grammar_bitmask"
+                ):
+                    # We aren't waiting for any tokens, get any grammar
+                    # output immediately.
+                    grammar_output = self.scheduler.get_grammar_bitmask(
+                        scheduler_output
+                    )
                 # Block-wait for execute to return (continues running async on the GPU).
                 with self.log_error_detail(scheduler_output):
                     exec_result = exec_future.result()
 
                 if exec_result is None:
-                    # Call sample tokens.
-                    future = self.model_executor.sample_tokens(
-                        grammar_output, non_block=True
-                    )
+                    with record_function_or_nullcontext(
+                        "core step_with_batch_queue: sample_tokens"
+                    ):
+                        # Call sample tokens.
+                        future = self.model_executor.sample_tokens(
+                            grammar_output, non_block=True
+                        )
                 else:
                     # No sampling required (e.g. all requests finished).
                     future = cast(Future[ModelRunnerOutput], exec_future)
@@ -426,27 +436,34 @@ class EngineCore:
             # only be called when the scheduler contains requests or the queue
             # is non-empty.
             return None, False
-
-        # Block until the next result is available.
-        future, scheduler_output = batch_queue.pop()
-        with self.log_error_detail(scheduler_output):
-            model_output = future.result()
-
-        engine_core_outputs = self.scheduler.update_from_output(
-            scheduler_output, model_output
-        )
+        with record_function_or_nullcontext("core step_with_batch_queue: model_output"):
+            # Block until the next result is available.
+            future, scheduler_output = batch_queue.pop()
+            with self.log_error_detail(scheduler_output):
+                model_output = future.result()
+        with record_function_or_nullcontext(
+            "core step_with_batch_queue: update_from_output"
+        ):
+            engine_core_outputs = self.scheduler.update_from_output(
+                scheduler_output, model_output
+            )
 
         # NOTE(nick): We can either handle the deferred tasks here or save
         # in a field and do it immediately once step_with_batch_queue is
         # re-called. The latter slightly favors TTFT over TPOT/throughput.
         if deferred_scheduler_output:
-            # We now have the tokens needed to compute the bitmask for the
-            # deferred request. Get the bitmask and call sample tokens.
-            grammar_output = self.scheduler.get_grammar_bitmask(
-                deferred_scheduler_output
-            )
-            future = self.model_executor.sample_tokens(grammar_output, non_block=True)
-            batch_queue.appendleft((future, deferred_scheduler_output))
+            with record_function_or_nullcontext(
+                "core step_with_batch_queue: deferred_scheduler_output"
+            ):
+                # We now have the tokens needed to compute the bitmask for the
+                # deferred request. Get the bitmask and call sample tokens.
+                grammar_output = self.scheduler.get_grammar_bitmask(
+                    deferred_scheduler_output
+                )
+                future = self.model_executor.sample_tokens(
+                    grammar_output, non_block=True
+                )
+                batch_queue.appendleft((future, deferred_scheduler_output))
 
         return engine_core_outputs, model_executed
 
@@ -640,11 +657,6 @@ class EngineCoreProc(EngineCore):
                 assert addresses.coordinator_input is not None
                 logger.info("Waiting for READY message from DP Coordinator...")
 
-        # Mark the startup heap as static so that it's ignored by GC.
-        # Reduces pause times of oldest generation collections.
-        gc.collect()
-        gc.freeze()
-
         # If enable, attach GC debugger after static variable freeze.
         maybe_attach_gc_debug_callback()
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index c2ca9579d55ea..d27d13840989e 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -36,6 +36,7 @@ from vllm.v1.executor import Executor
 from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
 from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
 from vllm.v1.metrics.stats import IterationStats
+from vllm.v1.utils import record_function_or_nullcontext
 from vllm.v1.worker.worker_base import WorkerBase
 
 logger = init_logger(__name__)
@@ -58,20 +59,6 @@ class LLMEngine:
         use_cached_outputs: bool = False,
         multiprocess_mode: bool = False,
     ) -> None:
-        if not envs.VLLM_USE_V1:
-            raise ValueError(
-                "Using V1 LLMEngine, but envs.VLLM_USE_V1=False. "
-                "This should not happen. As a workaround, try using "
-                "LLMEngine.from_vllm_config(...) or explicitly set "
-                "VLLM_USE_V1=0 or 1 and report this issue on Github."
-            )
-
-        if stat_loggers is not None:
-            raise NotImplementedError(
-                "Passing StatLoggers to LLMEngine in V1 is not yet supported. "
-                "Set VLLM_USE_V1=0 and file and issue on Github."
-            )
-
         self.vllm_config = vllm_config
         self.observability_config = vllm_config.observability_config
         self.model_config = vllm_config.model_config
@@ -294,27 +281,32 @@ class LLMEngine:
             return []
 
         # 1) Get EngineCoreOutput from the EngineCore.
-        outputs = self.engine_core.get_output()
+        with record_function_or_nullcontext("llm_genine step: get_output"):
+            outputs = self.engine_core.get_output()
 
         # 2) Process EngineCoreOutputs.
-        iteration_stats = IterationStats() if self.log_stats else None
-        processed_outputs = self.output_processor.process_outputs(
-            outputs.outputs,
-            engine_core_timestamp=outputs.timestamp,
-            iteration_stats=iteration_stats,
-        )
+        with record_function_or_nullcontext("llm_genine step: process_outputs"):
+            iteration_stats = IterationStats() if self.log_stats else None
+            processed_outputs = self.output_processor.process_outputs(
+                outputs.outputs,
+                engine_core_timestamp=outputs.timestamp,
+                iteration_stats=iteration_stats,
+            )
+            self.output_processor.update_scheduler_stats(outputs.scheduler_stats)
 
         # 3) Abort any reqs that finished due to stop strings.
-        self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
+        with record_function_or_nullcontext("llm_genine step: abort_requests"):
+            self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
 
         # 4) Record stats
-        if self.logger_manager is not None and outputs.scheduler_stats is not None:
-            self.logger_manager.record(
-                scheduler_stats=outputs.scheduler_stats,
-                iteration_stats=iteration_stats,
-                mm_cache_stats=self.processor.stat_mm_cache(),
-            )
-            self.do_log_stats_with_interval()
+        with record_function_or_nullcontext("llm_genine step: record_stats"):
+            if self.logger_manager is not None and outputs.scheduler_stats is not None:
+                self.logger_manager.record(
+                    scheduler_stats=outputs.scheduler_stats,
+                    iteration_stats=iteration_stats,
+                    mm_cache_stats=self.processor.stat_mm_cache(),
+                )
+                self.do_log_stats_with_interval()
 
         return processed_outputs.request_outputs
 
diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py
index 48bb5312f5d94..4c5955d7ee2e5 100644
--- a/vllm/v1/engine/logprobs.py
+++ b/vllm/v1/engine/logprobs.py
@@ -2,11 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-from collections.abc import Iterable
 from dataclasses import dataclass
 
 from vllm.logger import init_logger
-from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
+from vllm.logprobs import (
+    PromptLogprobs,
+    SampleLogprobs,
+    append_logprobs_for_next_position,
+    create_prompt_logprobs,
+    create_sample_logprobs,
+)
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer,
     convert_ids_list_to_tokens,
@@ -44,9 +49,10 @@ class LogprobsProcessor:
         return cls(
             tokenizer=tokenizer,
             cumulative_logprob=(None if num_logprobs is None else 0.0),
-            logprobs=(None if num_logprobs is None else []),
-            # NOTE: logprob of first prompt token is None.
-            prompt_logprobs=(None if num_prompt_logprobs is None else [None]),
+            logprobs=(None if num_logprobs is None else create_sample_logprobs()),
+            prompt_logprobs=(
+                None if num_prompt_logprobs is None else create_prompt_logprobs()
+            ),
             num_prompt_logprobs=num_prompt_logprobs,
             num_logprobs=num_logprobs,
         )
@@ -80,15 +86,14 @@ class LogprobsProcessor:
             sampled_token_logprob = logprobs[0]
             self.cumulative_logprob += sampled_token_logprob
 
-            # Update with the Logprob dictionary for this pos.
-            self.logprobs.append(
-                self._make_logprob_dict(
-                    logprobs,
-                    token_ids,
-                    decoded_tokens,
-                    rank,
-                    self.num_logprobs,
-                )
+            # Update with the Logprob container for this pos.
+            append_logprobs_for_next_position(
+                self.logprobs,
+                token_ids,
+                logprobs,
+                decoded_tokens,
+                rank,
+                self.num_logprobs,
             )
 
     def _update_prompt_logprobs(
@@ -136,15 +141,14 @@ class LogprobsProcessor:
                 NONES if decoded_tokens is None else decoded_tokens[offset:offset_end]
             )
 
-            # Update with the Logprob dictionary for this pos.
-            self.prompt_logprobs.append(
-                self._make_logprob_dict(
-                    prompt_logprobs[pos],
-                    token_ids[pos],
-                    decoded_tokens_for_pos,
-                    prompt_token_ranks[pos],
-                    self.num_prompt_logprobs,
-                )
+            # Update with the Logprob container for this pos.
+            append_logprobs_for_next_position(
+                self.prompt_logprobs,
+                token_ids[pos],
+                prompt_logprobs[pos],
+                decoded_tokens_for_pos,
+                prompt_token_ranks[pos],
+                self.num_prompt_logprobs,
             )
 
     def pop_prompt_logprobs(self) -> PromptLogprobs | None:
@@ -166,46 +170,6 @@ class LogprobsProcessor:
             self.prompt_logprobs = []
         return plp
 
-    @staticmethod
-    def _make_logprob_dict(
-        logprobs: list[float],
-        logprob_token_ids: list[int],
-        decoded_tokens: Iterable[str | None],
-        rank: int,
-        num_logprobs: int,
-    ) -> dict[int, Logprob]:
-        """Make a Logprob dictionary for a position.
-
-        Args:
-          logprobs: list of log probabilities
-          logprob_token_ids: list of top token ids
-          decoded_tokens: list of decoded top tokens
-          rank: rank of the sampled token
-          num_logprobs: number of logprobs requested
-            by the user (in addition to sampled logprob)
-
-        Returns:
-          dict[token id, Logprob]
-        """
-        if num_logprobs == -1:
-            num_logprobs = len(logprobs)
-        # We do not need a special case for the sampled token
-        # being in the topk, since inserting duplicated data
-        # into a dictionary twice is the same as doing it once.
-        topk_ranks = range(1, num_logprobs + 1)
-        ranks = itertools.chain((rank,), topk_ranks)
-
-        return {
-            token_id: Logprob(
-                logprob=logprob,
-                rank=rank,
-                decoded_token=token,
-            )
-            for token_id, logprob, rank, token in zip(
-                logprob_token_ids, logprobs, ranks, decoded_tokens
-            )
-        }
-
     def update_from_output(self, output: EngineCoreOutput) -> None:
         if output.new_logprobs is not None:
             self._update_sample_logprobs(output.new_logprobs)
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 07c8113dd9b33..d8d03f19d4663 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -22,7 +22,12 @@ from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
 from vllm.v1.engine.detokenizer import IncrementalDetokenizer
 from vllm.v1.engine.logprobs import LogprobsProcessor
 from vllm.v1.engine.parallel_sampling import ParentRequest
-from vllm.v1.metrics.stats import IterationStats, LoRARequestStates, RequestStateStats
+from vllm.v1.metrics.stats import (
+    IterationStats,
+    LoRARequestStates,
+    RequestStateStats,
+    SchedulerStats,
+)
 
 
 class RequestOutputCollector:
@@ -310,7 +315,7 @@ class OutputProcessor:
         self.tokenizer = tokenizer
         self.request_states: dict[str, RequestState] = {}
         self.parent_requests: dict[str, ParentRequest] = {}
-        self.lora_states = LoRARequestStates()
+        self.lora_states = LoRARequestStates(log_stats)
         self.tracer: Tracer | None = None
 
     def get_num_unfinished_requests(self):
@@ -334,7 +339,7 @@ class OutputProcessor:
         for request_id in request_ids:
             req_state = self.request_states.pop(request_id, None)
             if req_state is not None:
-                self.lora_states.abort_request(req_state)
+                self.lora_states.request_finished(request_id, req_state.lora_name)
                 request_ids_to_abort.append(request_id)
                 # Produce final abort output.
                 if req_state.queue is not None and (
@@ -382,7 +387,6 @@ class OutputProcessor:
             log_stats=self.log_stats,
         )
         self.request_states[request_id] = req_state
-        self.lora_states.add_request(req_state)
         if parent_req:
             self.parent_requests[parent_req.request_id] = parent_req
 
@@ -484,13 +488,15 @@ class OutputProcessor:
                 )
                 if self.tracer:
                     self.do_tracing(engine_core_output, req_state, iteration_stats)
-        self.lora_states.update_iteration_stats(iteration_stats)
 
         return OutputProcessorOutput(
             request_outputs=request_outputs,
             reqs_to_abort=reqs_to_abort,
         )
 
+    def update_scheduler_stats(self, scheduler_stats: SchedulerStats | None):
+        self.lora_states.update_scheduler_stats(scheduler_stats)
+
     def do_tracing(
         self,
         engine_core_output: EngineCoreOutput,
@@ -564,8 +570,6 @@ class OutputProcessor:
         if iteration_stats is None:
             return
 
-        lora_stats = self.lora_states.get_stats(req_state)
-
         assert engine_core_timestamp is not None
         assert req_state.stats is not None
         iteration_stats.update_from_output(
@@ -574,7 +578,8 @@ class OutputProcessor:
             req_state.is_prefilling,
             req_state.prompt_len,
             req_state.stats,
-            lora_stats,
+            self.lora_states,
+            req_state.lora_name,
         )
 
     def _update_stats_from_finished(
@@ -596,7 +601,7 @@ class OutputProcessor:
             max_tokens_param=req_state.max_tokens_param,
             req_stats=req_state.stats,
         )
-        self.lora_states.finish_request(req_state)
+        self.lora_states.request_finished(req_state.request_id, req_state.lora_name)
 
         ParentRequest.observe_finished_request(
             req_state.parent_req, iteration_stats, req_state.stats.num_generation_tokens
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index d76c6107ad2ba..1e913876b7635 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -171,7 +171,7 @@ class Executor(ABC):
         args: tuple = (),
         kwargs: dict | None = None,
         non_block: Literal[True] = True,
-    ) -> list[Future[_R]]:
+    ) -> Future[list[_R]]:
         pass
 
     @abstractmethod
@@ -219,7 +219,7 @@ class Executor(ABC):
 
     def sample_tokens(
         self, grammar_output: GrammarOutput | None, non_block: bool = False
-    ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
         output = self.collective_rpc(  # type: ignore[call-overload]
             "sample_tokens", args=(grammar_output,), non_block=non_block
         )
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 999a3ba870ead..1e249161c6886 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -9,8 +9,10 @@ import threading
 import time
 import traceback
 import weakref
+from collections import deque
 from collections.abc import Callable
-from concurrent.futures import Future, ThreadPoolExecutor
+from concurrent.futures import Future, InvalidStateError
+from contextlib import suppress
 from dataclasses import dataclass
 from enum import Enum, auto
 from functools import cached_property, partial
@@ -27,6 +29,7 @@ import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed import destroy_distributed_environment, destroy_model_parallel
 from vllm.distributed.device_communicators.shm_broadcast import Handle, MessageQueue
+from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
 from vllm.distributed.parallel_state import (
     get_dp_group,
     get_ep_group,
@@ -54,6 +57,35 @@ from vllm.v1.worker.worker_base import WorkerWrapperBase
 logger = init_logger(__name__)
 
 
+class FutureWrapper(Future):
+    def __init__(
+        self,
+        futures_queue: deque[tuple["FutureWrapper", Callable]],
+        aggregate: Callable = lambda x: x,
+    ):
+        self.futures_queue = futures_queue
+        self.aggregate = aggregate
+        super().__init__()
+
+    def result(self, timeout=None):
+        if timeout is not None:
+            raise RuntimeError("timeout not implemented")
+        # Drain any futures ahead of us in the queue.
+        while not self.done():
+            future, get_response = self.futures_queue.pop()
+            future.wait_for_response(get_response)
+        return super().result()
+
+    def wait_for_response(self, get_response: Callable):
+        try:
+            response = self.aggregate(get_response())
+            with suppress(InvalidStateError):
+                self.set_result(response)
+        except Exception as e:
+            with suppress(InvalidStateError):
+                self.set_exception(e)
+
+
 class MultiprocExecutor(Executor):
     supports_pp: bool = True
 
@@ -64,7 +96,6 @@ class MultiprocExecutor(Executor):
         self.is_failed = False
         self.shutdown_event = threading.Event()
         self.failure_callback: FailureCallback | None = None
-        self.io_thread_pool: ThreadPoolExecutor | None = None
 
         self.world_size = self.parallel_config.world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
@@ -132,15 +163,9 @@ class MultiprocExecutor(Executor):
                         uw.death_writer.close()
                 self._ensure_worker_termination([uw.proc for uw in unready_workers])
 
-        # Note: must use only 1 IO thread to keep dequeue sequence
-        # from the response queue.
-        # _async_aggregate_workers_output also assumes a single IO thread.
-        self.io_thread_pool = ThreadPoolExecutor(
-            max_workers=1, thread_name_prefix="mp_exec_io"
-        )
+        self.futures_queue = deque[tuple[FutureWrapper, Callable]]()
 
         self.output_rank = self._get_output_rank()
-        self.has_connector = self.vllm_config.kv_transfer_config is not None
 
     def start_worker_monitor(self):
         workers = self.workers
@@ -179,56 +204,37 @@ class MultiprocExecutor(Executor):
     def execute_model(  # type: ignore[override]
         self, scheduler_output: SchedulerOutput, non_block: bool = False
     ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
-        return self._execute_with_aggregation(
-            "execute_model", scheduler_output, non_block=non_block
+        return self.collective_rpc(
+            "execute_model",
+            args=(scheduler_output,),
+            unique_reply_rank=self.output_rank,
+            non_block=non_block,
+            timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
+            kv_output_aggregator=self.kv_output_aggregator,
         )
 
     def sample_tokens(  # type: ignore[override]
         self, grammar_output: GrammarOutput | None, non_block: bool = False
     ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
-        return self._execute_with_aggregation(  # type: ignore[return-value]
-            "sample_tokens", grammar_output, non_block=non_block
-        )
-
-    def _execute_with_aggregation(
-        self, method: str, *args, non_block: bool = False
-    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
-        if not self.has_connector:
-            # get output only from a single worker (output_rank)
-            (output,) = self.collective_rpc(
-                method,
-                args=args,
-                unique_reply_rank=self.output_rank,
-                non_block=non_block,
-                timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
-            )
-            return output
-
-        # get output from all workers
-        outputs = self.collective_rpc(
-            method,
-            args=args,
+        return self.collective_rpc(
+            "sample_tokens",
+            args=(grammar_output,),
+            unique_reply_rank=self.output_rank,
             non_block=non_block,
             timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
+            kv_output_aggregator=self.kv_output_aggregator,
         )
 
-        # aggregate all workers output to a single output
-        assert self.kv_output_aggregator is not None
-        if non_block:
-            return self.kv_output_aggregator.async_aggregate(outputs, self.output_rank)
-        return self.kv_output_aggregator.aggregate(outputs, self.output_rank)
-
     def execute_dummy_batch(self) -> None:
         self.collective_rpc("execute_dummy_batch", unique_reply_rank=self.output_rank)
 
     def take_draft_token_ids(self) -> DraftTokenIds | None:
         # OPTIMIZATION: Get output only from a single worker (output_rank)
-        outputs = self.collective_rpc(
+        return self.collective_rpc(
             "take_draft_token_ids", unique_reply_rank=self.output_rank
         )
-        return outputs[0]
 
-    def collective_rpc(
+    def collective_rpc(  # type: ignore[override]
         self,
         method: str | Callable,
         timeout: float | None = None,
@@ -236,73 +242,69 @@ class MultiprocExecutor(Executor):
         kwargs: dict | None = None,
         non_block: bool = False,
         unique_reply_rank: int | None = None,
-    ) -> list[Any]:
+        kv_output_aggregator: KVOutputAggregator = None,
+    ) -> Any | list[Any] | Future[Any | list[Any]]:
+        """Returns single result if unique_reply_rank and/or kv_output_aggregator
+        is provided, otherwise list."""
+
         if self.is_failed:
             raise RuntimeError("Executor failed.")
 
         deadline = None if timeout is None else time.monotonic() + timeout
         kwargs = kwargs or {}
 
-        # NOTE: If the args are heterogeneous, then we pack them into a list,
-        # and unpack them in the method of every worker, because every worker
-        # knows their own rank.
-        try:
-            if isinstance(method, str):
-                send_method = method
-            else:
-                send_method = cloudpickle.dumps(
-                    method, protocol=pickle.HIGHEST_PROTOCOL
-                )
-            self.rpc_broadcast_mq.enqueue(
-                (send_method, args, kwargs, unique_reply_rank)
+        if kv_output_aggregator is not None:
+            output_rank = None
+            aggregate: Callable[[Any], Any] = partial(
+                kv_output_aggregator.aggregate, output_rank=unique_reply_rank or 0
             )
+        else:
+            output_rank = unique_reply_rank
+            aggregate = lambda x: x
 
-            workers = (
-                (self.workers[unique_reply_rank],)
-                if unique_reply_rank is not None
-                else self.workers
-            )
+        if isinstance(method, str):
+            send_method = method
+        else:
+            send_method = cloudpickle.dumps(method, protocol=pickle.HIGHEST_PROTOCOL)
+        self.rpc_broadcast_mq.enqueue((send_method, args, kwargs, output_rank))
+
+        workers = (
+            (self.workers[output_rank],) if output_rank is not None else self.workers
+        )
+
+        shutdown_event = self.shutdown_event
+
+        def get_response():
             responses = []
-
-            def get_response(
-                w: WorkerProcHandle,
-                dequeue_timeout: float | None = None,
-                cancel_event: threading.Event | None = None,
-            ):
-                status, result = w.worker_response_mq.dequeue(
-                    timeout=dequeue_timeout, cancel=cancel_event
+            for w in workers:
+                dequeue_timeout = (
+                    None if deadline is None else (deadline - time.monotonic())
                 )
-
+                try:
+                    status, result = w.worker_response_mq.dequeue(
+                        timeout=dequeue_timeout, cancel=shutdown_event
+                    )
+                except TimeoutError as e:
+                    raise TimeoutError(f"RPC call to {method} timed out.") from e
                 if status != WorkerProc.ResponseStatus.SUCCESS:
                     raise RuntimeError(
                         f"Worker failed with error '{result}', please check the"
                         " stack trace above for the root cause"
                     )
-                return result
-
-            for w in workers:
-                dequeue_timeout = (
-                    None if deadline is None else (deadline - time.monotonic())
-                )
-
-                if self.io_thread_pool is not None:
-                    # We must consume worker_response_mq from a single thread.
-                    result = self.io_thread_pool.submit(  # type: ignore
-                        get_response, w, dequeue_timeout, self.shutdown_event
-                    )
-                    if not non_block:
-                        result = result.result()
-                elif not non_block:
-                    result = get_response(w, dequeue_timeout, self.shutdown_event)
-                else:
-                    raise RuntimeError(
-                        "non_block can only be used when max_concurrent_batches > 1"
-                    )
                 responses.append(result)
+            return responses[0] if output_rank is not None else responses
 
-            return responses
-        except TimeoutError as e:
-            raise TimeoutError(f"RPC call to {method} timed out.") from e
+        if non_block:
+            future = FutureWrapper(self.futures_queue, aggregate=aggregate)
+            self.futures_queue.appendleft((future, get_response))
+            return future
+
+        # First drain any pending futures in the queue.
+        while self.futures_queue:
+            future, get_fut_response = self.futures_queue.pop()
+            future.wait_for_response(get_fut_response)
+
+        return aggregate(get_response())
 
     @staticmethod
     def _ensure_worker_termination(worker_procs: list[BaseProcess]):
@@ -348,9 +350,6 @@ class MultiprocExecutor(Executor):
                 self._ensure_worker_termination([w.proc for w in workers])
 
             self.shutdown_event.set()
-            if self.io_thread_pool is not None:
-                self.io_thread_pool.shutdown(wait=False, cancel_futures=True)
-                del self.io_thread_pool
 
         self.rpc_broadcast_mq = None
 
diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
index 4a69cca723ac9..119e4c0818316 100644
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -435,26 +435,25 @@ class RayDistributedExecutor(Executor):
 
             # When PP is used, we return a FutureWrapper immediately so that
             # the scheduler can yield to the next batch.
-            return FutureWrapper(refs)
+            return FutureWrapper(refs[0])
 
         # Get output from all workers when connector is present
         assert self.kv_output_aggregator is not None
         if not non_block:
             # Block and get results from all workers
-            outputs = [ref.get() for ref in refs]
-            return self.kv_output_aggregator.aggregate(outputs)
+            return self.kv_output_aggregator.aggregate(ray.get(refs))
 
         # Return a future that will aggregate outputs from all workers
         return FutureWrapper(refs, self.kv_output_aggregator)
 
-    def collective_rpc(
+    def collective_rpc(  # type: ignore[override]
         self,
         method: str | Callable,
         timeout: float | None = None,
         args: tuple = (),
         kwargs: dict[str, Any] | None = None,
         non_block: bool = False,
-    ) -> list[Any]:
+    ) -> list[Any] | Future[list[Any]]:
         """Runs the given method on all workers."""
         sent_method = method if isinstance(method, str) else cloudpickle.dumps(method)
         del method
@@ -470,7 +469,7 @@ class RayDistributedExecutor(Executor):
 
         # Get the results of the ray workers.
         if non_block:
-            return [FutureWrapper((output,)) for output in ray_worker_outputs]
+            return FutureWrapper(ray_worker_outputs)
 
         return ray.get(ray_worker_outputs, timeout=timeout)
 
diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index a282cdc9909db..21910d1160bd4 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -141,19 +141,16 @@ class FutureWrapper(Future):
     the result() call. If not only the first worker's output is returned.
     """
 
-    def __init__(self, refs, aggregator: KVOutputAggregator | None = None):
+    def __init__(self, ref_or_refs, aggregator: KVOutputAggregator | None = None):
         super().__init__()
-        self.refs = refs
+        self.ref_or_refs = ref_or_refs
         self.aggregator = aggregator
 
     def result(self, timeout=None):
-        if timeout is not None:
-            raise NotImplementedError("timeout is not supported")
-
+        outputs = ray.get(self.ref_or_refs, timeout=timeout)
         if self.aggregator is None:
-            return self.refs[0].get()
+            return outputs
 
-        outputs = [ref.get() for ref in self.refs]
         return self.aggregator.aggregate(outputs, output_rank=0)
 
 
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index f17d3c3092701..095d3d1dac21b 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -13,9 +13,10 @@ import torch.distributed as dist
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.outputs import AsyncModelRunnerOutput
+from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
 from vllm.v1.serial_utils import run_method
 from vllm.v1.worker.worker_base import WorkerWrapperBase
 
@@ -58,32 +59,60 @@ class UniProcExecutor(Executor):
     def max_concurrent_batches(self) -> int:
         return 2 if self.scheduler_config.async_scheduling else 1
 
-    def collective_rpc(
+    def collective_rpc(  # type: ignore[override]
         self,
         method: str | Callable,
         timeout: float | None = None,
         args: tuple = (),
         kwargs: dict | None = None,
         non_block: bool = False,
-    ) -> list[Any]:
+        single_value: bool = False,
+    ) -> Any | list[Any] | Future[Any | list[Any]]:
         if kwargs is None:
             kwargs = {}
 
         if not non_block:
-            return [run_method(self.driver_worker, method, args, kwargs)]
+            result = run_method(self.driver_worker, method, args, kwargs)
+            return result if single_value else [result]
 
         try:
             result = run_method(self.driver_worker, method, args, kwargs)
             if isinstance(result, AsyncModelRunnerOutput):
                 if (async_thread := self.async_output_thread) is not None:
-                    return [async_thread.submit(result.get_output)]
+                    get_output = result.get_output
+                    if not single_value:
+                        get_output = lambda go=result.get_output: [go()]
+                    return async_thread.submit(get_output)
                 result = result.get_output()
             future = Future[Any]()
-            future.set_result(result)
+            future.set_result(result if single_value else [result])
         except Exception as e:
             future = Future[Any]()
             future.set_exception(e)
-        return [future]
+        return future
+
+    def execute_model(  # type: ignore[override]
+        self, scheduler_output: SchedulerOutput, non_block: bool = False
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
+        return self.collective_rpc(
+            "execute_model",
+            args=(scheduler_output,),
+            non_block=non_block,
+            single_value=True,
+        )
+
+    def sample_tokens(  # type: ignore[override]
+        self, grammar_output: GrammarOutput | None, non_block: bool = False
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
+        return self.collective_rpc(
+            "sample_tokens",
+            args=(grammar_output,),
+            non_block=non_block,
+            single_value=True,
+        )
+
+    def take_draft_token_ids(self) -> DraftTokenIds | None:
+        return self.collective_rpc("take_draft_token_ids", single_value=True)
 
     def check_health(self) -> None:
         # UniProcExecutor will always be healthy as long as
@@ -124,11 +153,10 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
 
     def _init_executor(self) -> None:
         """Initialize the worker and load the model."""
-        if envs.VLLM_USE_V1:
-            assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, (
-                "To get deterministic execution in V1, "
-                "please set VLLM_ENABLE_V1_MULTIPROCESSING=0"
-            )
+        assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, (
+            "To get deterministic execution, "
+            "please set VLLM_ENABLE_V1_MULTIPROCESSING=0"
+        )
         super()._init_executor()
 
     def _distributed_args(self) -> tuple[str, int, int]:
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 0f564fdb3b080..7f33eb7e699c7 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
-from dataclasses import dataclass, fields
+from dataclasses import dataclass, fields, replace
 from math import prod
 
 import torch
@@ -44,6 +44,12 @@ class KVCacheSpec:
         """
         raise NotImplementedError
 
+    def copy_with_new_block_size(self, block_size: int) -> Self:
+        """
+        Create a new KVCacheSpec from self but replacing the block size.
+        """
+        return replace(self, block_size=block_size)
+
     @classmethod
     def merge(cls, specs: list[Self]) -> Self:
         """
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 67b6ceaa847f6..1a175e9e110bd 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -117,11 +117,13 @@ class LoggingStatLogger(StatLoggerBase):
         # Tracked stats over current local logging interval.
         self.num_prompt_tokens: int = 0
         self.num_generation_tokens: int = 0
+        self.num_corrupted_reqs: int = 0
 
     def _track_iteration_stats(self, iteration_stats: IterationStats):
         # Save tracked stats for token counters.
         self.num_prompt_tokens += iteration_stats.num_prompt_tokens
         self.num_generation_tokens += iteration_stats.num_generation_tokens
+        self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs
 
     def _get_throughput(self, tracked_stats: int, now: float) -> float:
         # Compute summary metrics for tracked stats
@@ -205,6 +207,10 @@ class LoggingStatLogger(StatLoggerBase):
             self.last_scheduler_stats.kv_cache_usage * 100,
             self.prefix_caching_metrics.hit_rate * 100,
         ]
+
+        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
+            log_parts.append("Corrupted: %d reqs")
+            log_args.append(self.num_corrupted_reqs)
         if not self.connector_prefix_caching_metrics.empty:
             log_parts.append("External prefix cache hit rate: %.1f%%")
             log_args.append(self.connector_prefix_caching_metrics.hit_rate * 100)
@@ -276,9 +282,6 @@ class AggregatedLoggingStatLogger(LoggingStatLogger, AggregateStatLoggerBase):
             self.last_scheduler_stats.num_running_reqs += (
                 last_scheduler_stats.num_running_reqs
             )
-            self.last_scheduler_stats.num_corrupted_reqs += (
-                last_scheduler_stats.num_corrupted_reqs
-            )
             self.last_scheduler_stats.kv_cache_usage += (
                 last_scheduler_stats.kv_cache_usage
             )
@@ -395,32 +398,32 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
         self.gauge_scheduler_waiting = make_per_engine(
             gauge_scheduler_waiting, engine_indexes, model_name
         )
-        if envs.VLLM_SERVER_DEV_MODE:
-            gauge_engine_sleep_state = self._gauge_cls(
-                name="vllm:engine_sleep_state",
-                documentation=(
-                    "Engine sleep state; awake = 0 means engine is sleeping; "
-                    "awake = 1 means engine is awake; "
-                    "weights_offloaded = 1 means sleep level 1; "
-                    "discard_all = 1 means sleep level 2."
-                ),
-                labelnames=labelnames + ["sleep_state"],
-                multiprocess_mode="mostrecent",
-            )
 
-            self.gauge_engine_sleep_state = {}
-            sleep_state = ["awake", "weights_offloaded", "discard_all"]
+        gauge_engine_sleep_state = self._gauge_cls(
+            name="vllm:engine_sleep_state",
+            documentation=(
+                "Engine sleep state; awake = 0 means engine is sleeping; "
+                "awake = 1 means engine is awake; "
+                "weights_offloaded = 1 means sleep level 1; "
+                "discard_all = 1 means sleep level 2."
+            ),
+            labelnames=labelnames + ["sleep_state"],
+            multiprocess_mode="mostrecent",
+        )
 
-            for s in sleep_state:
-                self.gauge_engine_sleep_state[s] = {
-                    idx: gauge_engine_sleep_state.labels(
-                        engine=idx, model_name=model_name, sleep_state=s
-                    )
-                    for idx in engine_indexes
-                }
+        self.gauge_engine_sleep_state = {}
+        sleep_state = ["awake", "weights_offloaded", "discard_all"]
 
-            # Setting default values
-            self.record_sleep_state()
+        for s in sleep_state:
+            self.gauge_engine_sleep_state[s] = {
+                idx: gauge_engine_sleep_state.labels(
+                    engine=idx, model_name=model_name, sleep_state=s
+                )
+                for idx in engine_indexes
+            }
+
+        # Setting default values
+        self.record_sleep_state()
 
         # GPU cache
         #
@@ -482,6 +485,19 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
             gauge_kv_cache_usage, engine_indexes, model_name
         )
 
+        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
+            counter_corrupted_requests = self._counter_cls(
+                name="vllm:corrupted_requests",
+                documentation=(
+                    "Corrupted requests, in terms of total number of requests "
+                    "with NaNs in logits."
+                ),
+                labelnames=labelnames,
+            )
+            self.counter_corrupted_requests = make_per_engine(
+                counter_corrupted_requests, engine_indexes, model_name
+            )
+
         counter_prefix_cache_queries = self._counter_cls(
             name="vllm:prefix_cache_queries",
             documentation=(
@@ -934,7 +950,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
             self.gauge_scheduler_waiting[engine_idx].set(
                 scheduler_stats.num_waiting_reqs
             )
-
             if self.show_hidden_metrics:
                 self.gauge_gpu_cache_usage[engine_idx].set(
                     scheduler_stats.kv_cache_usage
@@ -974,13 +989,30 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
                     scheduler_stats.kv_connector_stats, engine_idx
                 )
 
+            if self.gauge_lora_info is not None:
+                running_lora_adapters = ",".join(
+                    scheduler_stats.running_lora_adapters.keys()
+                )
+                waiting_lora_adapters = ",".join(
+                    scheduler_stats.waiting_lora_adapters.keys()
+                )
+                lora_info_labels = {
+                    self.labelname_running_lora_adapters: running_lora_adapters,
+                    self.labelname_waiting_lora_adapters: waiting_lora_adapters,
+                    self.labelname_max_lora: self.max_lora,
+                }
+                self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time()
+
         if mm_cache_stats is not None:
             self.counter_mm_cache_queries[engine_idx].inc(mm_cache_stats.queries)
             self.counter_mm_cache_hits[engine_idx].inc(mm_cache_stats.hits)
 
         if iteration_stats is None:
             return
-
+        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
+            self.counter_corrupted_requests[engine_idx].inc(
+                iteration_stats.num_corrupted_reqs
+            )
         self.counter_num_preempted_reqs[engine_idx].inc(
             iteration_stats.num_preempted_reqs
         )
@@ -1037,24 +1069,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
                     finished_request.max_tokens_param
                 )
 
-        if self.gauge_lora_info is not None:
-            running_lora_adapters = ",".join(
-                iteration_stats.running_lora_adapters.keys()
-            )
-            waiting_lora_adapters = ",".join(
-                iteration_stats.waiting_lora_adapters.keys()
-            )
-            lora_info_labels = {
-                self.labelname_running_lora_adapters: running_lora_adapters,
-                self.labelname_waiting_lora_adapters: waiting_lora_adapters,
-                self.labelname_max_lora: self.max_lora,
-            }
-            self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time()
-
     def record_sleep_state(self, sleep: int = 0, level: int = 0):
-        if not envs.VLLM_SERVER_DEV_MODE:
-            return
-
         awake = 1
         discard_all = 0
         weights_offloaded = 0
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 7868141d1b1da..4e9db98db0bc2 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -2,15 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
-from collections import deque
+from collections import defaultdict, deque
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
 
+import vllm.envs as envs
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
 
 if TYPE_CHECKING:
     from vllm.v1.engine import EngineCoreEvent, EngineCoreOutput, FinishReason
-    from vllm.v1.engine.output_processor import RequestState
 
 
 @dataclass
@@ -169,13 +169,8 @@ class SchedulerStats:
     spec_decoding_stats: SpecDecodingStats | None = None
     kv_connector_stats: dict[str, Any] | None = None
 
-    num_corrupted_reqs: int = 0
-
-
-@dataclass
-class LoRAStats:
-    waiting_requests: set[str] = field(default_factory=set)
-    running_requests: set[str] = field(default_factory=set)
+    waiting_lora_adapters: dict[str, int] = field(default_factory=dict)
+    running_lora_adapters: dict[str, int] = field(default_factory=dict)
 
 
 @dataclass
@@ -196,6 +191,9 @@ class RequestStateStats:
     # first token latency
     first_token_latency: float = 0.0
 
+    # Track if this request is corrupted (NaNs in logits)
+    is_corrupted: bool = False
+
 
 @dataclass
 class FinishedRequestStats:
@@ -211,6 +209,7 @@ class FinishedRequestStats:
     inference_time: float = 0.0
     decode_time: float = 0.0
     mean_time_per_output_token: float = 0.0
+    is_corrupted: bool = False
 
 
 class IterationStats:
@@ -226,8 +225,7 @@ class IterationStats:
         self.n_params_iter: list[int] = []
         self.time_to_first_tokens_iter: list[float] = []
         self.inter_token_latencies_iter: list[float] = []
-        self.waiting_lora_adapters: dict[str, int] = {}
-        self.running_lora_adapters: dict[str, int] = {}
+        self.num_corrupted_reqs: int = 0
 
     def __repr__(self) -> str:
         field_to_value_str = ", ".join(f"{k}={v}" for k, v in vars(self).items())
@@ -244,7 +242,8 @@ class IterationStats:
         is_prefilling: bool,
         prompt_len: int,
         req_stats: RequestStateStats,
-        lora_stats: LoRAStats | None,
+        lora_states: "LoRARequestStates",
+        lora_name: str | None,
     ):
         num_new_generation_tokens = len(output.new_token_ids)
 
@@ -258,10 +257,24 @@ class IterationStats:
 
         req_stats.num_generation_tokens += num_new_generation_tokens
 
+        # Track if this request is corrupted (only check once per request)
+        # Early exit if already marked as corrupted to avoid redundant checks
+        if (
+            envs.VLLM_COMPUTE_NANS_IN_LOGITS
+            and not req_stats.is_corrupted
+            and output.num_nans_in_logits > 0
+        ):
+            req_stats.is_corrupted = True
+
         # Process request-level engine core events
         if output.events is not None:
             self.update_from_events(
-                output.request_id, output.events, is_prefilling, req_stats, lora_stats
+                output.request_id,
+                output.events,
+                is_prefilling,
+                req_stats,
+                lora_states,
+                lora_name,
             )
 
         # Process the batch-level "new tokens" engine core event
@@ -279,7 +292,8 @@ class IterationStats:
         events: list["EngineCoreEvent"],
         is_prefilling: bool,
         req_stats: RequestStateStats,
-        lora_stats: LoRAStats | None,
+        lora_states: "LoRARequestStates",
+        lora_name: str | None,
     ):
         # Avoid circular dependency
         from vllm.v1.engine import EngineCoreEventType
@@ -287,15 +301,14 @@ class IterationStats:
         for event in events:
             if event.type == EngineCoreEventType.QUEUED:
                 req_stats.queued_ts = event.timestamp
-                if lora_stats is not None:
-                    lora_stats.waiting_requests.add(req_id)
+                lora_states.request_waiting(req_id, lora_name)
             elif event.type == EngineCoreEventType.SCHEDULED:
                 if req_stats.scheduled_ts == 0.0:  # ignore preemptions
                     req_stats.scheduled_ts = event.timestamp
-                LoRARequestStates.scheduled_request(lora_stats, req_id)
+                lora_states.request_running(req_id, lora_name)
             elif event.type == EngineCoreEventType.PREEMPTED:
                 self.num_preempted_reqs += 1
-                LoRARequestStates.preempted_request(lora_stats, req_id)
+                lora_states.request_waiting(req_id, lora_name)
 
     def update_from_finished_request(
         self,
@@ -339,65 +352,69 @@ class IterationStats:
             inference_time=inference_time,
             decode_time=decode_time,
             mean_time_per_output_token=mean_time_per_output_token,
+            is_corrupted=req_stats.is_corrupted,
         )
         self.finished_requests.append(finished_req)
 
+        # Count corrupted requests when they finish (only once per request)
+        if req_stats.is_corrupted:
+            self.num_corrupted_reqs += 1
 
-class LoRARequestStates:
-    """Per-LoRA request state stats."""
+
+class LoRAStats:
+    """Tracks waiting and running request IDs for a single LoRA."""
 
     def __init__(self):
-        self.lora_name_to_stats: dict[str, LoRAStats] = {}
+        self.waiting: set[str] = set()
+        self.running: set[str] = set()
 
-    def get_stats(self, req_state: "RequestState") -> LoRAStats | None:
-        if req_state.lora_name is None:
-            return None
-        if req_state.lora_name not in self.lora_name_to_stats:
-            self.lora_name_to_stats[req_state.lora_name] = LoRAStats()
-        return self.lora_name_to_stats[req_state.lora_name]
+    def update(self, req_id: str, waiting: bool, running: bool):
+        assert not (waiting and running)
+        if waiting:
+            self.waiting.add(req_id)
+        else:
+            self.waiting.discard(req_id)
 
-    def add_request(self, req_state: "RequestState"):
-        if (lora_stats := self.get_stats(req_state)) is not None:
-            lora_stats.waiting_requests.add(req_state.request_id)
+        if running:
+            self.running.add(req_id)
+        else:
+            self.running.discard(req_id)
 
-    def finish_request(self, req_state: "RequestState"):
-        if req_state.lora_name is None:
+    @property
+    def empty(self) -> bool:
+        return not (self.waiting or self.running)
+
+
+class LoRARequestStates:
+    """A per-LoRA count of running and waiting requests."""
+
+    def __init__(self, log_stats: bool = False):
+        self.log_stats = log_stats
+        self.requests: defaultdict[str, LoRAStats] = defaultdict(LoRAStats)
+
+    def _request_update(
+        self, req_id: str, lora_name: str | None, waiting: bool, running: bool
+    ):
+        if not self.log_stats or lora_name is None:
             return
-        lora_stats = self.lora_name_to_stats[req_state.lora_name]
-        lora_stats.running_requests.remove(req_state.request_id)
 
-    def abort_request(self, req_state: "RequestState"):
-        if req_state.lora_name is None:
-            return
-        lora_stats = self.lora_name_to_stats[req_state.lora_name]
-        lora_stats.waiting_requests.discard(req_state.request_id)
-        lora_stats.running_requests.discard(req_state.request_id)
+        lora_stats = self.requests[lora_name]
+        lora_stats.update(req_id, waiting, running)
+        if lora_stats.empty:
+            del self.requests[lora_name]
 
-    # Break the pattern for this lifecycle methods so we can
-    # call this from IterationStats.update_from_events()
-    @staticmethod
-    def scheduled_request(lora_stats: LoRAStats | None, request_id: str):
-        if lora_stats is None:
-            return
-        lora_stats.waiting_requests.remove(request_id)
-        lora_stats.running_requests.add(request_id)
+    def request_waiting(self, req_id: str, lora_name: str | None):
+        self._request_update(req_id, lora_name, waiting=True, running=False)
 
-    @staticmethod
-    def preempted_request(lora_stats: LoRAStats | None, request_id: str):
-        if lora_stats is None:
-            return
-        lora_stats.running_requests.remove(request_id)
-        lora_stats.waiting_requests.add(request_id)
+    def request_running(self, req_id: str, lora_name: str | None):
+        self._request_update(req_id, lora_name, waiting=False, running=True)
 
-    def update_iteration_stats(self, iteration_stats: IterationStats | None):
-        if iteration_stats is None:
+    def request_finished(self, req_id: str, lora_name: str | None):
+        self._request_update(req_id, lora_name, waiting=False, running=False)
+
+    def update_scheduler_stats(self, scheduler_stats: SchedulerStats | None):
+        if not self.log_stats or scheduler_stats is None:
             return
-        for lora_name, stats in self.lora_name_to_stats.items():
-            if stats.waiting_requests:
-                iteration_stats.waiting_lora_adapters[lora_name] = len(
-                    stats.waiting_requests
-                )
-            if stats.running_requests:
-                iteration_stats.running_lora_adapters[lora_name] = len(
-                    stats.running_requests
-                )
+        for lora_name, stats in self.requests.items():
+            scheduler_stats.waiting_lora_adapters[lora_name] = len(stats.waiting)
+            scheduler_stats.running_lora_adapters[lora_name] = len(stats.running)
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index e7122ba339681..b5cba96e1026f 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -30,16 +30,23 @@ class LogprobsLists(NamedTuple):
         if self.cu_num_generated_tokens:
             start = self.cu_num_generated_tokens[start_req_idx]
             end = self.cu_num_generated_tokens[end_req_idx]
+            # Recompute cumulative array starting from 0
+            cu_num_offset = self.cu_num_generated_tokens[start_req_idx]
+            sliced_cu_num_generated_tokens = [
+                cu_num - cu_num_offset
+                for cu_num in self.cu_num_generated_tokens[
+                    start_req_idx : end_req_idx + 1
+                ]
+            ]
         else:
             start = start_req_idx
             end = end_req_idx
+            sliced_cu_num_generated_tokens = None
         return LogprobsLists(
             self.logprob_token_ids[start:end],
             self.logprobs[start:end],
             self.sampled_token_ranks[start:end],
-            self.cu_num_generated_tokens[start_req_idx:end_req_idx]
-            if self.cu_num_generated_tokens
-            else None,
+            sliced_cu_num_generated_tokens,
         )
 
 
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 864b0eb7fa410..7a5f1183ed48e 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -168,10 +168,6 @@ class Request:
     def use_structured_output(self) -> bool:
         return self.structured_output_request is not None
 
-    @property
-    def is_output_corrupted(self) -> bool:
-        return self.num_nans_in_logits > 0
-
     @property
     def num_tokens(self) -> int:
         return len(self._all_token_ids)
diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py
index 566de5bcda772..eb537eae6c904 100644
--- a/vllm/v1/sample/logits_processor/__init__.py
+++ b/vllm/v1/sample/logits_processor/__init__.py
@@ -13,6 +13,7 @@ import torch
 from vllm.logger import init_logger
 from vllm.logits_process import LogitsProcessor as RequestLogitsProcessor
 from vllm.sampling_params import SamplingParams
+from vllm.utils.torch_utils import guard_cuda_initialization
 from vllm.v1.sample.logits_processor.builtin import (
     LogitBiasLogitsProcessor,
     MinPLogitsProcessor,
@@ -72,8 +73,10 @@ def _load_logitsprocs_plugins() -> list[type[LogitsProcessor]]:
                 entrypoint.name,
                 entrypoint.value,
             )
-            classes.append(entrypoint.load())
+            with guard_cuda_initialization():
+                classes.append(entrypoint.load())
         except Exception as e:
+            logger.error("Failed to load LogitsProcessor plugin %s: %s", entrypoint, e)
             raise RuntimeError(
                 f"Failed to load LogitsProcessor plugin {entrypoint}"
             ) from e
@@ -126,8 +129,15 @@ def _load_logitsprocs_by_fqcns(
 
         try:
             # Load module
-            module = importlib.import_module(module_path)
+            with guard_cuda_initialization():
+                module = importlib.import_module(module_path)
         except Exception as e:
+            logger.error(
+                "Failed to load %sth LogitsProcessor plugin %s: %s",
+                ldx,
+                logitproc,
+                e,
+            )
             raise RuntimeError(
                 f"Failed to load {ldx}th LogitsProcessor plugin {logitproc}"
             ) from e
@@ -206,6 +216,14 @@ def build_logitsprocs(
     )
 
 
+def validate_logits_processors_parameters(
+    logits_processors: Sequence[str | type[LogitsProcessor]] | None,
+    sampling_params: SamplingParams,
+):
+    for logits_procs in _load_custom_logitsprocs(logits_processors):
+        logits_procs.validate_params(sampling_params)
+
+
 class AdapterLogitsProcessor(LogitsProcessor):
     """Wrapper for per-request logits processors
 
diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py
index efa0f62ad6e1d..0cbfb187878a2 100644
--- a/vllm/v1/sample/logits_processor/interface.py
+++ b/vllm/v1/sample/logits_processor/interface.py
@@ -58,6 +58,14 @@ class BatchUpdate:
 
 
 class LogitsProcessor(ABC):
+    @classmethod
+    def validate_params(cls, sampling_params: SamplingParams):
+        """Validate sampling params for this logits processor.
+
+        Raise ValueError for invalid ones.
+        """
+        return None
+
     @abstractmethod
     def __init__(
         self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py
index 898b90d41abae..241d9de957ea2 100644
--- a/vllm/v1/sample/ops/penalties.py
+++ b/vllm/v1/sample/ops/penalties.py
@@ -21,6 +21,14 @@ def apply_all_penalties(
     """
     _, vocab_size = logits.shape
     output_tokens_t = _convert_to_tensors(output_token_ids, vocab_size, logits.device)
+
+    # In the async scheduling case, rows that won't have penalties applied may contain
+    # -1 placeholder token ids. We must replace these with valid token ids so that the
+    # scatter done in apply_penalties is valid.
+    # NOTE(nick): The penalties implementation is currently quite inefficient and
+    # will be reworked anyhow.
+    output_tokens_t.masked_fill_(output_tokens_t == -1, vocab_size)
+
     return apply_penalties(
         logits,
         prompt_token_ids,
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 7a4b224822bd8..02ea658b7f20e 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -127,15 +127,6 @@ class TopKTopPSampler(nn.Module):
         elif self.logprobs_mode == "processed_logprobs":
             logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
 
-        # Note: this is a workaround for
-        # https://github.com/pytorch/pytorch/pull/151218
-        @torch.compile(dynamic=True)
-        def compiled_random_sample(logits: torch.Tensor) -> torch.Tensor:
-            probs = logits.softmax(dim=-1, dtype=torch.float32)
-            q = torch.empty_like(probs)
-            q.exponential_()
-            return probs.div(q).argmax(dim=-1).view(-1)
-
         if len(generators) != logits.shape[0]:
             return compiled_random_sample(logits), logits_to_return
         else:
@@ -148,6 +139,16 @@ class TopKTopPSampler(nn.Module):
             return probs.div_(q).argmax(dim=-1).view(-1), logits_to_return
 
 
+# Note: this is a workaround for
+# https://github.com/pytorch/pytorch/pull/151218
+@torch.compile(dynamic=True)
+def compiled_random_sample(logits: torch.Tensor) -> torch.Tensor:
+    probs = logits.softmax(dim=-1, dtype=torch.float32)
+    q = torch.empty_like(probs)
+    q.exponential_()
+    return probs.div(q).argmax(dim=-1).view(-1)
+
+
 def apply_top_k_top_p(
     logits: torch.Tensor,
     k: torch.Tensor | None,
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 35c2e73e8ee2c..75a4140fd6552 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -109,6 +109,7 @@ class EagleProposer:
             else []
         )
 
+        self.use_cuda_graph = self.use_cuda_graph and bool(self.cudagraph_batch_sizes)
         # persistent buffers for cuda graph
         self.input_ids = torch.zeros(
             self.max_num_tokens, dtype=torch.int32, device=device
@@ -315,7 +316,12 @@ class EagleProposer:
             positions = target_positions[:, last_token_indices]
         else:
             positions = target_positions[last_token_indices]
-        if self.method in ("deepseek_mtp", "ernie_mtp", "longcat_flash_mtp"):
+        if self.method in (
+            "deepseek_mtp",
+            "ernie_mtp",
+            "longcat_flash_mtp",
+            "pangu_ultra_moe_mtp",
+        ):
             hidden_states = self.hidden_states[last_token_indices]
         else:
             hidden_states = hidden_states[last_token_indices]
@@ -939,7 +945,7 @@ class EagleProposer:
             self.vllm_config, DeepseekV32IndexerCache
         )
         draft_indexer_layer_names = indexer_layers.keys() - target_indexer_layer_names
-        self.attn_layer_names = list(draft_attn_layer_names)
+        self.attn_layer_names = list(draft_attn_layer_names - draft_indexer_layer_names)
         self.indexer_layer_names = list(draft_indexer_layer_names)
 
         if self.indexer_layer_names:
@@ -1050,16 +1056,18 @@ class EagleProposer:
         num_tokens: int,
         use_cudagraphs=True,
     ) -> None:
-        if use_cudagraphs and num_tokens <= self.cudagraph_batch_sizes[-1]:
+        # Determine if CUDA graphs should be used for this run.
+        cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph
+        if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]:
             num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
 
         with set_forward_context(
             None,
             self.vllm_config,
             num_tokens=num_tokens,
-            cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE
-            if use_cudagraphs
-            else CUDAGraphMode.NONE,
+            cudagraph_runtime_mode=(
+                CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE
+            ),
         ):
             if self.supports_mm_inputs:
                 input_ids = None
diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py
index 150dde177ce8d..12b903ccaca97 100644
--- a/vllm/v1/spec_decode/medusa.py
+++ b/vllm/v1/spec_decode/medusa.py
@@ -8,6 +8,7 @@ from vllm.config import VllmConfig
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.models.interfaces import is_mixture_of_experts
 from vllm.v1.sample.metadata import SamplingMetadata
 
 # Initialize logger
@@ -56,6 +57,10 @@ class MedusaProposer:
                 vllm_config=self.vllm_config,
                 model_config=self.vllm_config.speculative_config.draft_model_config,
             )
+        assert not (
+            is_mixture_of_experts(self.model)
+            and self.vllm_config.parallel_config.enable_eplb
+        ), "EPLB for Medusa is not supported"
 
     @torch.inference_mode()
     def dummy_run(self, num_tokens: int) -> None:
diff --git a/vllm/v1/spec_decode/suffix_decoding.py b/vllm/v1/spec_decode/suffix_decoding.py
new file mode 100644
index 0000000000000..049e335db3254
--- /dev/null
+++ b/vllm/v1/spec_decode/suffix_decoding.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.config import VllmConfig
+from vllm.v1.worker.gpu_input_batch import InputBatch
+
+
+class SuffixDecodingProposer:
+    """
+    Speculative decoding proposer for Suffix Decoding (https://arxiv.org/pdf/2411.04975).
+    This class imports and uses the official implementation from Arctic Inference
+    (https://github.com/snowflakedb/ArcticInference).
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        config = vllm_config.speculative_config
+        self.num_speculative_tokens = config.num_speculative_tokens
+        self.max_tree_depth = config.suffix_decoding_max_tree_depth
+        self.max_spec_factor = config.suffix_decoding_max_spec_factor
+        self.min_token_prob = config.suffix_decoding_min_token_prob
+        self.max_model_len = vllm_config.model_config.max_model_len
+
+        # Lazy import to avoid error when Suffix Decoding is not used.
+        from arctic_inference.suffix_decoding import SuffixDecodingCache
+
+        # Initialize and empty cache. This object will take care of caching request
+        # outputs, evicting old requests, and manages the per-prompt suffix trees.
+        self.suffix_cache = SuffixDecodingCache(
+            max_tree_depth=config.suffix_decoding_max_tree_depth,
+            max_cached_requests=config.suffix_decoding_max_cached_requests,
+        )
+
+    def propose(
+        self,
+        input_batch: InputBatch,
+        sampled_token_ids: list[list[int]],
+    ) -> list[list[int]]:
+        """
+        Propose speculative tokens for each request in the input batch. Suffix Decoding
+        will speculate a dynamic number of tokens for each request every decoding step,
+        so each entry in the returned list may have different lengths.
+        """
+        draft_token_ids: list[list[int]] = []
+        for i, sampled_ids in enumerate(sampled_token_ids):
+            if not sampled_ids:
+                # Skip speculative decoding for partial prefills.
+                draft_token_ids.append([])
+                continue
+
+            # Skip requests that require sampling parameters that are not
+            # supported with speculative decoding.
+            req_id = input_batch.req_ids[i]
+            if req_id in input_batch.spec_decode_unsupported_reqs:
+                draft_token_ids.append([])
+                continue
+
+            num_tokens = input_batch.num_tokens_no_spec[i]
+            if num_tokens >= self.max_model_len:
+                # Skip requests that have already reached the max model length.
+                draft_token_ids.append([])
+                continue
+
+            index = input_batch.req_id_to_index[req_id]
+            if req_id not in self.suffix_cache.active_requests:
+                if req_id in self.suffix_cache.cached_requests:
+                    # Reset the suffix cache for this request.
+                    self.suffix_cache.evict_cached_response(req_id)
+                num_prompt_tokens = input_batch.num_prompt_tokens[index]
+                prompt_token_ids = input_batch.token_ids_cpu[index, :num_prompt_tokens]
+                # Start a new request, this will build the suffix tree for that prompt.
+                self.suffix_cache.start_request(req_id, prompt_token_ids)
+
+            # Append the newly sampled ids to the suffix cache for this request.
+            self.suffix_cache.add_active_response(req_id, sampled_ids)
+
+            # Suffix decoding only uses the most recent tokens up to max_tree_depth, so
+            # we extract the pattern from the end of the input.
+            start = max(0, num_tokens - self.max_tree_depth)
+            pattern = input_batch.token_ids_cpu[i, start:num_tokens]
+            draft = self.suffix_cache.speculate(
+                req_id,
+                pattern,
+                max_spec_tokens=min(
+                    self.num_speculative_tokens, self.max_model_len - num_tokens - 1
+                ),
+                max_spec_factor=self.max_spec_factor,
+                min_token_prob=self.min_token_prob,
+            )
+
+            draft_token_ids.append(draft.token_ids)
+
+        # Stop requests that were not seen in the input batch.
+        for req_id in (
+            self.suffix_cache.active_requests - input_batch.req_id_to_index.keys()
+        ):
+            self.suffix_cache.stop_request(req_id)
+
+        return draft_token_ids
+
+    def load_model(self, *args, **kwargs):
+        # No model to load.
+        pass
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 6f9dbeabd8ca6..029129cf1a475 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -64,6 +64,15 @@ class StructuredOutputManager:
             self.tokenizer = init_tokenizer_from_configs(
                 model_config=self.vllm_config.model_config
             )
+            reasoning_parser = (
+                self.vllm_config.structured_outputs_config.reasoning_parser
+            )
+            reasoning_parser_plugin = (
+                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
+            )
+            if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
+                ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
+
             reasoning_parser = (
                 self.vllm_config.structured_outputs_config.reasoning_parser
             )
@@ -260,9 +269,10 @@ class StructuredOutputManager:
                         and token is not None
                         and not structured_output_request.grammar.is_terminated()
                     ):
-                        assert structured_output_request.grammar.accept_tokens(
+                        accepted = structured_output_request.grammar.accept_tokens(
                             req_id, [token]
                         )
+                        assert accepted, (token, req_id, scheduled_spec_decode_tokens)
                         state_advancements += 1
                     cumulative_index += 1
                 if state_advancements > 0:
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index 00a625e103bd3..2962a439dcb3e 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -111,6 +111,7 @@ class GuidanceGrammar(StructuredOutputGrammar):
     vocab_size: int
     printed_error: bool = False
     terminated: bool = False
+    rollback_lag: int = 0
 
     def check_error(self):
         if not self.printed_error:
@@ -127,6 +128,8 @@ class GuidanceGrammar(StructuredOutputGrammar):
         """
 
         if self.ll_tokenizer.eos_token in tokens:
+            if self.ll_matcher.is_stopped() and not self.terminated:
+                self.rollback_lag = 1
             self.terminated = True
 
         if self.ll_matcher.is_stopped():
@@ -163,8 +166,11 @@ class GuidanceGrammar(StructuredOutputGrammar):
         return tokens[:num_tokens]
 
     def rollback(self, num_tokens: int) -> None:
-        self.ll_matcher.rollback(num_tokens)
-        self.check_error()
+        if num_tokens > 0:
+            self.ll_matcher.rollback(num_tokens - self.rollback_lag)
+            self.terminated = False
+            self.rollback_lag = 0
+            self.check_error()
 
     def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
         # this will automatically return [EOS] mask if the matcher is stopped
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index e041015e56e9f..c28bf542f85c5 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -22,6 +22,7 @@ class BlockTable:
         pin_memory: bool,
         device: torch.device,
         kernel_block_size: int,
+        dcp_kv_cache_interleave_size: int,
     ):
         """
         Args:
@@ -86,6 +87,7 @@ class BlockTable:
             # DCP might not be initialized in testing
             self.dcp_world_size = 1
             self.dcp_rank = 0
+        self.dcp_kv_cache_interleave_size = dcp_kv_cache_interleave_size
 
     def append_row(
         self,
@@ -144,9 +146,19 @@ class BlockTable:
             # Use virtual_block_size for mask calculation, which marks local
             # tokens.
             virtual_block_offsets = positions % virtual_block_size
-            mask = virtual_block_offsets % self.dcp_world_size == self.dcp_rank
+            mask = (
+                virtual_block_offsets
+                // self.dcp_kv_cache_interleave_size
+                % self.dcp_world_size
+                == self.dcp_rank
+            )
             # Calculate local block_offsets
-            block_offsets = virtual_block_offsets // self.dcp_world_size
+            block_offsets = (
+                virtual_block_offsets
+                // (self.dcp_world_size * self.dcp_kv_cache_interleave_size)
+                * self.dcp_kv_cache_interleave_size
+                + virtual_block_offsets % self.dcp_kv_cache_interleave_size
+            )
             # Calculate slot_mapping
             slot_mapping = block_numbers * self.block_size + block_offsets
             # Write final slots, use -1 for not-local
@@ -234,6 +246,7 @@ class MultiGroupBlockTable:
         block_sizes: list[int],
         kernel_block_sizes: list[int],
         num_speculative_tokens: int = 0,
+        dcp_kv_cache_interleave_size: int = 1,
     ) -> None:
         # Note(hc): each dcp rank only store
         # (max_model_len//dcp_world_size) tokens in kvcache,
@@ -263,6 +276,7 @@ class MultiGroupBlockTable:
                 pin_memory,
                 device,
                 kernel_block_size,
+                dcp_kv_cache_interleave_size,
             )
             for block_size, kernel_block_size in zip(block_sizes, kernel_block_sizes)
         ]
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index 5b57df2d472c8..4420a057d1e58 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -37,6 +37,28 @@ class CPUWorker(Worker):
 
         self.parallel_config.disable_custom_all_reduce = True
 
+        if envs.VLLM_TORCH_PROFILER_DIR:
+            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+            worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
+            logger.info(
+                "Profiling enabled. Traces will be saved to: %s",
+                torch_profiler_trace_dir,
+            )
+            self.profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                ],
+                record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
+                profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
+                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
+                with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    torch_profiler_trace_dir, worker_name=worker_name, use_gzip=False
+                ),
+            )
+        else:
+            self.profiler = None
+
     def init_device(self):
         # Setup OpenMP threads affinity.
         omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
@@ -47,13 +69,15 @@ class CPUWorker(Worker):
                 self.local_omp_cpuid = self._get_autobind_cpu_ids(
                     lambda cpus: [cpu for cpu in cpus if cpu.id % 8 < 4]
                 )
-            elif current_platform.get_cpu_architecture() == CpuArchEnum.X86:
+            elif cpu_arch == CpuArchEnum.X86:
                 # For x86 SMT-2, use 1 CPU per core
                 self.local_omp_cpuid = self._get_autobind_cpu_ids(
                     lambda cpus: cpus[-1:]
                 )
             else:
-                self.local_omp_cpuid = "all"
+                self.local_omp_cpuid = "nobind"
+        elif omp_cpuids == "nobind":
+            self.local_omp_cpuid = "nobind"
         else:
             local_dp_rank = self.parallel_config.data_parallel_rank_local
             omp_cpuids = omp_cpuids.split("|")
@@ -64,7 +88,7 @@ class CPUWorker(Worker):
                 ]
             self.local_omp_cpuid = omp_cpuids[self.rank]
 
-        if self.local_omp_cpuid != "all":
+        if self.local_omp_cpuid != "nobind":
             ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
             if ret:
                 logger.info(ret)
@@ -166,3 +190,17 @@ class CPUWorker(Worker):
             [(x.id, x.physical_core) for x in logical_cpu_list],
         )
         return ",".join([str(x.id) for x in logical_cpu_list])
+
+    def profile(self, is_start: bool = True):
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        if is_start:
+            self.profiler.start()
+        else:
+            self.profiler.stop()
+            if self.local_rank == 0:
+                logger.info(
+                    self.profiler.key_averages().table(
+                        sort_by="self_cpu_time_total", row_limit=50
+                    )
+                )
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index fe834db115e70..393181f543d2e 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -84,6 +84,7 @@ class InputBatch:
         is_spec_decode: bool = False,
         is_pooling_model: bool = False,
         num_speculative_tokens: int = 0,
+        dcp_kv_cache_interleave_size: int = 1,
     ):
         self.is_pooling_model = is_pooling_model
         self.is_spec_decode = is_spec_decode
@@ -137,6 +138,7 @@ class InputBatch:
             block_sizes=block_sizes,
             kernel_block_sizes=kernel_block_sizes,
             num_speculative_tokens=num_speculative_tokens,
+            dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
         )
 
         # Sampling-related.
@@ -859,22 +861,24 @@ class InputBatch:
         return prompt_token_ids_cpu_tensor.to(device=self.device, non_blocking=True)
 
     def make_lora_inputs(
-        self, num_scheduled_tokens: np.ndarray
+        self, num_scheduled_tokens: np.ndarray, num_sampled_tokens: np.ndarray
     ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]:
         """
         Given the num_scheduled_tokens for each request in the batch, return
         datastructures used to activate the current LoRAs.
         Returns:
-            1. prompt_lora_mapping: A tuple of size self.num_reqs where,
-               prompt_lora_mapping[i] is the LoRA id to use for the ith prompt.
+            1. prompt_lora_mapping: A tuple of size np.sum(num_sampled_tokens)
+               where, prompt_lora_mapping[i] is the LoRA id to use for the ith
+               sampled token.
             2. token_lora_mapping: A tuple of size np.sum(num_scheduled_tokens)
                where, token_lora_mapping[i] is the LoRA id to use for ith token.
             3. lora_requests: Set of relevant LoRA requests.
         """
 
         req_lora_mapping = self.request_lora_mapping[: self.num_reqs]
-        prompt_lora_mapping = tuple(req_lora_mapping)
+        prompt_lora_mapping = tuple(req_lora_mapping.repeat(num_sampled_tokens))
         token_lora_mapping = tuple(req_lora_mapping.repeat(num_scheduled_tokens))
+
         active_lora_requests: set[LoRARequest] = set(
             self.lora_id_to_lora_request.values()
         )
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 66a9d72912618..6fccf2ea2f47c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -20,7 +20,11 @@ from tqdm import tqdm
 
 import vllm.envs as envs
 from vllm.attention import Attention, AttentionType
-from vllm.attention.backends.abstract import AttentionBackend, MultipleOf
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionMetadata,
+    MultipleOf,
+)
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.cuda_graph import CUDAGraphWrapper
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
@@ -35,6 +39,7 @@ from vllm.distributed.eplb.eplb_state import EplbState
 from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group
 from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks
 from vllm.distributed.parallel_state import (
+    get_dcp_group,
     get_pp_group,
     get_tp_group,
     graph_capture,
@@ -81,13 +86,13 @@ from vllm.utils.torch_utils import (
     kv_cache_dtype_str_to_dtype,
     supports_dynamo,
 )
-from vllm.v1.attention.backends.flash_attn import AttentionMetadata
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
     create_fast_prefill_custom_backend,
+    get_dcp_local_seq_lens,
     reorder_batch_to_split_decodes_and_prefills,
     split_attn_metadata,
 )
@@ -125,6 +130,7 @@ from vllm.v1.spec_decode.eagle import EagleProposer
 from vllm.v1.spec_decode.medusa import MedusaProposer
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer
 from vllm.v1.structured_output.utils import apply_grammar_bitmask
 from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
 from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
@@ -273,7 +279,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # This will be overridden in load_model()
         self.is_multimodal_pruning_enabled = False
         self.max_model_len = model_config.max_model_len
+
+        # Always set to false after the first forward pass
+        self.calculate_kv_scales = self.cache_config.calculate_kv_scales
         self.dcp_world_size = self.parallel_config.decode_context_parallel_size
+        self.dcp_rank = 0 if self.dcp_world_size <= 1 else get_dcp_group().rank_in_group
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
 
@@ -336,16 +346,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # the last PP rank. This is not ideal if there are many
         # layers in the draft model.
         if self.speculative_config and get_pp_group().is_last_rank:
+            self.drafter: (
+                NgramProposer | SuffixDecodingProposer | EagleProposer | MedusaProposer
+            )
             if self.speculative_config.method == "ngram":
                 self.drafter = NgramProposer(self.vllm_config)
+            elif self.speculative_config.method == "suffix":
+                self.drafter = SuffixDecodingProposer(self.vllm_config)
             elif self.speculative_config.use_eagle():
-                self.drafter = EagleProposer(self.vllm_config, self.device, self)  # type: ignore
+                self.drafter = EagleProposer(self.vllm_config, self.device, self)
                 if self.speculative_config.method == "eagle3":
                     self.use_aux_hidden_state_outputs = True
             elif self.speculative_config.method == "medusa":
                 self.drafter = MedusaProposer(
                     vllm_config=self.vllm_config, device=self.device
-                )  # type: ignore
+                )
             else:
                 raise ValueError(
                     "Unknown speculative decoding method: "
@@ -390,6 +405,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # uses output token ids so we set this conservatively.
             logitsprocs_need_output_token_ids=bool(custom_logitsprocs),
             is_pooling_model=self.is_pooling_model,
+            dcp_kv_cache_interleave_size=self.parallel_config.dcp_kv_cache_interleave_size,
         )
 
         self.use_async_scheduling = self.scheduler_config.async_scheduling
@@ -518,7 +534,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self._draft_token_ids: list[list[int]] | torch.Tensor | None = None
         self.transfer_event = torch.cuda.Event()
         self.sampled_token_ids_pinned_cpu = torch.empty(
-            (self.max_model_len, 1),
+            (self.max_num_reqs, 1),
             dtype=torch.int64,
             device="cpu",
             pin_memory=self.pin_memory,
@@ -932,6 +948,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             device=self.device,
             pin_memory=self.pin_memory,
             merge_by_field_config=model.merge_by_field_config,
+            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             mm_kwargs_combined.update(mm_kwargs_group)
 
@@ -1043,7 +1060,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
     def _get_encoder_seq_lens(
         self,
-        scheduler_output: "SchedulerOutput",
+        scheduled_encoder_inputs: dict[str, list[int]],
         kv_cache_spec: KVCacheSpec,
         num_reqs: int,
     ) -> np.ndarray | None:
@@ -1053,31 +1070,27 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Build encoder_seq_lens array mapping request indices to
         # encoder lengths for inputs scheduled in this batch
         encoder_seq_lens = np.zeros(num_reqs, dtype=np.int32)
-        for req_id in scheduler_output.scheduled_encoder_inputs:
+        for req_id in scheduled_encoder_inputs:
             req_index = self.input_batch.req_id_to_index[req_id]
             encoder_seq_lens[req_index] = self.max_encoder_len
 
         return encoder_seq_lens
 
     def _prepare_inputs(
-        self, scheduler_output: "SchedulerOutput"
+        self,
+        scheduler_output: "SchedulerOutput",
+        num_scheduled_tokens: np.ndarray,
+        max_num_scheduled_tokens: int,
     ) -> tuple[
-        PerLayerAttnMetadata,
         torch.Tensor,
         SpecDecodeMetadata | None,
-        np.ndarray,
-        CommonAttentionMetadata | None,
-        int,
         UBatchSlices | None,
         torch.Tensor | None,
-        bool,
     ]:
         """
         :return: tuple[
-            attn_metadata: layer-to-attention_metadata mapping,
             logits_indices, spec_decode_metadata,
-            num_scheduled_tokens, spec_decode_common_attn_metadata,
-            max_num_scheduled_tokens, use_cascade_attn
+            ubatch_slices, num_tokens_across_dp,
         ]
         """
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
@@ -1089,12 +1102,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # This way, we can overlap the copy with the following CPU operations.
         self.input_batch.block_table.commit_block_table(num_reqs)
 
-        # Get the number of scheduled tokens for each request.
-        req_ids = self.input_batch.req_ids
-        tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
-        num_scheduled_tokens = np.array(tokens, dtype=np.int32)
-        max_num_scheduled_tokens = max(tokens)
-
         # Get request indices.
         # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
         req_indices = np.repeat(self.arange_np[:num_reqs], num_scheduled_tokens)
@@ -1221,8 +1228,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Fill unused with 0 for full cuda graph mode.
         self.seq_lens.np[num_reqs:].fill(0)
         self.seq_lens.copy_to_gpu()
-        seq_lens = self.seq_lens.gpu[:num_reqs]
-        max_seq_len = self.seq_lens.np[:num_reqs].max().item()
 
         num_tokens = [self.requests[r].num_tokens for r in self.input_batch.req_ids]
         num_tokens_np = np.array(num_tokens, dtype=np.int32)
@@ -1261,6 +1266,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             logits_indices = query_start_loc[1:] - 1
             num_draft_tokens = None
             spec_decode_metadata = None
+            num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
         else:
             # Get the number of draft tokens for each request.
             # Iterate over the dictionary rather than all requests since not all
@@ -1287,30 +1293,88 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 num_draft_tokens, cu_num_tokens
             )
             logits_indices = spec_decode_metadata.logits_indices
-
+            num_sampled_tokens = num_draft_tokens + 1
             # For DECODE only cuda graph of some attention backends (e.g., GDN).
             self.num_decode_draft_tokens.np[:num_reqs] = num_decode_draft_tokens
             self.num_decode_draft_tokens.np[num_reqs:].fill(-1)
             self.num_decode_draft_tokens.copy_to_gpu()
 
-        logits_indices_padded = None
-        if self.cache_config.kv_sharing_fast_prefill:
-            logits_indices_padded = self._prepare_kv_sharing_fast_prefill(
-                logits_indices
+        # Hot-Swap lora model
+        if self.lora_config:
+            assert (
+                np.sum(num_sampled_tokens)
+                <= self.vllm_config.scheduler_config.max_num_batched_tokens
             )
+            self.set_active_loras(
+                self.input_batch, num_scheduled_tokens, num_sampled_tokens
+            )
+
+        return (
+            logits_indices,
+            spec_decode_metadata,
+            ubatch_slices,
+            num_tokens_across_dp,
+        )
+
+    def _build_attention_metadata(
+        self,
+        total_num_scheduled_tokens: int,
+        max_num_scheduled_tokens: int,
+        num_reqs: int,
+        ubatch_slices: UBatchSlices | None = None,
+        logits_indices: torch.Tensor | None = None,
+        use_spec_decode: bool = False,
+        for_cudagraph_capture: bool = False,
+        scheduled_encoder_inputs: dict[str, list[int]] | None = None,
+        cascade_attn_prefix_lens: list[list[int]] | None = None,
+    ) -> tuple[PerLayerAttnMetadata, CommonAttentionMetadata | None]:
+        """
+        :return: tuple[attn_metadata, spec_decode_common_attn_metadata]
+        """
+        logits_indices_padded = None
+        num_logits_indices = 0
+        if logits_indices is not None:
+            num_logits_indices = logits_indices.size(0)
+            if self.cache_config.kv_sharing_fast_prefill:
+                logits_indices_padded = self._prepare_kv_sharing_fast_prefill(
+                    logits_indices
+                )
+
+        # update seq_lens of decode reqs under DCP.
+        if self.dcp_world_size > 1:
+            self.dcp_local_seq_lens.cpu[:num_reqs] = get_dcp_local_seq_lens(
+                self.seq_lens.cpu[:num_reqs],
+                self.dcp_world_size,
+                self.dcp_rank,
+                self.parallel_config.dcp_kv_cache_interleave_size,
+            )
+            self.dcp_local_seq_lens.copy_to_gpu(num_reqs)
 
         attn_metadata: PerLayerAttnMetadata = {}
         if ubatch_slices is not None:
             attn_metadata = [dict() for _ in range(len(ubatch_slices))]
-        use_cascade_attn = False
 
-        # Used in the below loop.
+        # Used in the below loop
+        query_start_loc = self.query_start_loc.gpu[: num_reqs + 1]
         query_start_loc_cpu = self.query_start_loc.cpu[: num_reqs + 1]
+        seq_lens = self.seq_lens.gpu[:num_reqs]
         seq_lens_cpu = self.seq_lens.cpu[:num_reqs]
         num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[
             :num_reqs
         ]
+        dcp_local_seq_lens = (
+            self.dcp_local_seq_lens.gpu[:num_reqs] if self.dcp_world_size > 1 else None
+        )
         spec_decode_common_attn_metadata = None
+
+        if for_cudagraph_capture:
+            # For some attention backends (e.g. FA) with sliding window models we need
+            # to make sure the backend see a max_seq_len that is larger to the sliding
+            # window size when capturing to make sure the correct kernel is selected.
+            max_seq_len = self.max_model_len
+        else:
+            max_seq_len = self.seq_lens.np[:num_reqs].max().item()
+
         if use_spec_decode:
             self.num_accepted_tokens.np[:num_reqs] = (
                 self.input_batch.num_accepted_tokens_cpu[:num_reqs]
@@ -1320,14 +1384,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
-        for kv_cache_group_id, kv_cache_group_spec in enumerate(
+        for kv_cache_gid, kv_cache_group in enumerate(
             self.kv_cache_config.kv_cache_groups
         ):
             encoder_seq_lens = self._get_encoder_seq_lens(
-                scheduler_output, kv_cache_group_spec.kv_cache_spec, num_reqs
+                scheduled_encoder_inputs or {},
+                kv_cache_group.kv_cache_spec,
+                num_reqs,
             )
 
-            if isinstance(kv_cache_group_spec.kv_cache_spec, EncoderOnlyAttentionSpec):
+            if isinstance(kv_cache_group.kv_cache_spec, EncoderOnlyAttentionSpec):
                 # Encoder-only layers do not have KV cache, so we need to
                 # create a dummy block table and slot mapping for them.
                 blk_table_tensor = torch.zeros(
@@ -1340,18 +1406,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     dtype=torch.int64,
                     device=self.device,
                 )
-                num_common_prefix_blocks = 0
             else:
-                blk_table = self.input_batch.block_table[kv_cache_group_id]
+                blk_table = self.input_batch.block_table[kv_cache_gid]
                 blk_table_tensor = blk_table.get_device_tensor(num_reqs)
                 slot_mapping = blk_table.slot_mapping.gpu[:total_num_scheduled_tokens]
 
                 # Fill unused with -1. Needed for reshape_and_cache in full cuda
                 # graph mode.
                 blk_table.slot_mapping.gpu[total_num_scheduled_tokens:].fill_(-1)
-                num_common_prefix_blocks = scheduler_output.num_common_prefix_blocks[
-                    kv_cache_group_id
-                ]
 
             common_attn_metadata = CommonAttentionMetadata(
                 query_start_loc=query_start_loc,
@@ -1366,35 +1428,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 block_table_tensor=blk_table_tensor,
                 slot_mapping=slot_mapping,
                 logits_indices_padded=logits_indices_padded,
-                num_logits_indices=logits_indices.size(0),
+                num_logits_indices=num_logits_indices,
                 causal=True,
                 encoder_seq_lens=encoder_seq_lens,
-                dcp_local_seq_lens=self.dcp_local_seq_lens.gpu[:num_reqs]
-                if self.dcp_world_size > 1
-                else None,
+                dcp_local_seq_lens=dcp_local_seq_lens,
             )
 
             if self.speculative_config and spec_decode_common_attn_metadata is None:
                 if isinstance(self.drafter, EagleProposer):
-                    if (
-                        self.drafter.attn_layer_names[0]
-                        in kv_cache_group_spec.layer_names
-                    ):
+                    if self.drafter.attn_layer_names[0] in kv_cache_group.layer_names:
                         spec_decode_common_attn_metadata = common_attn_metadata
                 else:
                     spec_decode_common_attn_metadata = common_attn_metadata
 
-            for attn_group in self.attn_groups[kv_cache_group_id]:
-                # Prepare for cascade attention if enabled & beneficial.
-                common_prefix_len = 0
+            for attn_gid, attn_group in enumerate(self.attn_groups[kv_cache_gid]):
+                cascade_attn_prefix_len = (
+                    cascade_attn_prefix_lens[kv_cache_gid][attn_gid]
+                    if cascade_attn_prefix_lens
+                    else 0
+                )
                 builder = attn_group.get_metadata_builder()
-                if self.cascade_attn_enabled:
-                    common_prefix_len = self._compute_cascade_attn_prefix_len(
-                        num_scheduled_tokens,
-                        num_common_prefix_blocks,
-                        attn_group.kv_cache_spec,
-                        builder,
-                    )
 
                 extra_attn_metadata_args = {}
                 if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder):
@@ -1412,45 +1465,69 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     for ubid, common_attn_metadata in enumerate(
                         common_attn_metadata_list
                     ):
-                        attn_metadata_i = attn_group.get_metadata_builder(
-                            ubatch_id=ubid
-                        ).build(
-                            common_prefix_len=common_prefix_len,
-                            common_attn_metadata=common_attn_metadata,
-                        )
-                        for layer_name in kv_cache_group_spec.layer_names:
+                        builder = attn_group.get_metadata_builder(ubatch_id=ubid)
+                        if for_cudagraph_capture:
+                            attn_metadata_i = builder.build_for_cudagraph_capture(
+                                common_attn_metadata
+                            )
+                        else:
+                            attn_metadata_i = builder.build(
+                                common_prefix_len=cascade_attn_prefix_len,
+                                common_attn_metadata=common_attn_metadata,
+                            )
+                        for layer_name in kv_cache_group.layer_names:
                             assert type(attn_metadata) is list
                             attn_metadata[ubid][layer_name] = attn_metadata_i
                 else:
                     assert isinstance(attn_metadata, dict)
-                    attn_metadata_i = builder.build(
-                        common_prefix_len=common_prefix_len,
-                        common_attn_metadata=common_attn_metadata,
-                        **extra_attn_metadata_args,
-                    )
-                    use_cascade_attn |= getattr(attn_metadata_i, "use_cascade", False)
+                    if for_cudagraph_capture:
+                        attn_metadata_i = builder.build_for_cudagraph_capture(
+                            common_attn_metadata
+                        )
+                    else:
+                        attn_metadata_i = builder.build(
+                            common_prefix_len=cascade_attn_prefix_len,
+                            common_attn_metadata=common_attn_metadata,
+                            **extra_attn_metadata_args,
+                        )
                     for layer_name in attn_group.layer_names:
                         attn_metadata[layer_name] = attn_metadata_i
 
-        # disable cascade attention when DBO
-        if ubatch_slices is not None:
-            use_cascade_attn = False
+        return attn_metadata, spec_decode_common_attn_metadata
 
-        # Hot-Swap lora model
-        if self.lora_config:
-            self.set_active_loras(self.input_batch, num_scheduled_tokens)
+    def _compute_cascade_attn_prefix_lens(
+        self,
+        num_scheduled_tokens: np.ndarray,
+        num_common_prefix_blocks: list[int],
+    ) -> list[list[int]] | None:
+        """
+        :return: Optional[cascade_attn_prefix_lens]
+            cascade_attn_prefix_lens is 2D: ``[kv_cache_group_id][attn_group_idx]``,
+            None if we should not use cascade attention
+        """
 
-        return (
-            attn_metadata,
-            logits_indices,
-            spec_decode_metadata,
-            num_scheduled_tokens,
-            spec_decode_common_attn_metadata,
-            max_num_scheduled_tokens,
-            ubatch_slices,
-            num_tokens_across_dp,
-            use_cascade_attn,
-        )
+        use_cascade_attn = False
+        num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups)
+        cascade_attn_prefix_lens: list[list[int]] = [
+            [] for _ in range(num_kv_cache_groups)
+        ]
+
+        for kv_cache_gid in range(num_kv_cache_groups):
+            for attn_group in self.attn_groups[kv_cache_gid]:
+                if isinstance(attn_group.kv_cache_spec, EncoderOnlyAttentionSpec):
+                    cascade_attn_prefix_len = 0
+                else:
+                    # 0 if cascade attention should not be used
+                    cascade_attn_prefix_len = self._compute_cascade_attn_prefix_len(
+                        num_scheduled_tokens,
+                        num_common_prefix_blocks[kv_cache_gid],
+                        attn_group.kv_cache_spec,
+                        attn_group.get_metadata_builder(),
+                    )
+                cascade_attn_prefix_lens[kv_cache_gid].append(cascade_attn_prefix_len)
+                use_cascade_attn |= cascade_attn_prefix_len > 0
+
+        return cascade_attn_prefix_lens if use_cascade_attn else None
 
     def _compute_cascade_attn_prefix_len(
         self,
@@ -1476,6 +1553,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         Returns:
             int: Length of common prefix in tokens.
         """
+
         common_prefix_len = num_common_prefix_blocks * kv_cache_spec.block_size
         if common_prefix_len == 0:
             # Common case.
@@ -1762,6 +1840,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             device=self.device,
             pin_memory=self.pin_memory,
             merge_by_field_config=model.merge_by_field_config,
+            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             curr_group_outputs = []
 
@@ -1788,6 +1867,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                             device=self.device,
                             pin_memory=self.pin_memory,
                             merge_by_field_config=model.merge_by_field_config,
+                            multimodal_cpu_fields=model.multimodal_cpu_fields,
                         )
                     )
 
@@ -1930,6 +2010,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             device=self.device,
             pin_memory=self.pin_memory,
             merge_by_field_config=model.merge_by_field_config,
+            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             # Add the grouped features to encoder_features dict
             # This allows the model to receive them as kwargs (e.g.,
@@ -2040,7 +2121,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         model = self.get_model()
         assert is_mixture_of_experts(model)
         self.eplb_state.step(
-            model,
             is_dummy,
             is_profile,
             log_stats=self.parallel_config.eplb_config.log_balancedness,
@@ -2448,7 +2528,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 "after execute_model() returns None."
             )
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        with record_function_or_nullcontext("Preprocess"):
+        with record_function_or_nullcontext("gpu_model_runner: preprocess"):
             with self.synchronize_input_prep():
                 # Update persistent batch states.
                 self._update_states(scheduler_output)
@@ -2467,18 +2547,48 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                         "it when the requests need prompt logprobs"
                     )
 
-                # Prepare the decoder inputs.
+                num_reqs = self.input_batch.num_reqs
+                req_ids = self.input_batch.req_ids
+                tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
+                num_scheduled_tokens_np = np.array(tokens, dtype=np.int32)
+                max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
+
                 (
-                    attn_metadata,
                     logits_indices,
                     spec_decode_metadata,
-                    num_scheduled_tokens_np,
-                    spec_decode_common_attn_metadata,
-                    max_query_len,
                     ubatch_slices,
                     num_tokens_across_dp,
-                    use_cascade_attn,
-                ) = self._prepare_inputs(scheduler_output)
+                ) = self._prepare_inputs(
+                    scheduler_output, num_scheduled_tokens_np, max_num_scheduled_tokens
+                )
+
+                cascade_attn_prefix_lens = None
+                # Disable cascade attention when using microbatching (DBO)
+                if self.cascade_attn_enabled and ubatch_slices is None:
+                    # Pre-compute cascade attention prefix lengths
+                    # NOTE: Must be AFTER _prepare_inputs uses self.input_batch state
+                    cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens(
+                        num_scheduled_tokens_np,
+                        scheduler_output.num_common_prefix_blocks,
+                    )
+
+                # TODO(lucas): move cudagraph dispatching here:
+                #   https://github.com/vllm-project/vllm/issues/23789
+
+                total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
+                use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
+                attn_metadata, spec_decode_common_attn_metadata = (
+                    self._build_attention_metadata(
+                        total_num_scheduled_tokens=total_num_scheduled_tokens,
+                        max_num_scheduled_tokens=max_num_scheduled_tokens,
+                        num_reqs=num_reqs,
+                        ubatch_slices=ubatch_slices,
+                        logits_indices=logits_indices,
+                        use_spec_decode=use_spec_decode,
+                        scheduled_encoder_inputs=scheduler_output.scheduled_encoder_inputs,
+                        cascade_attn_prefix_lens=cascade_attn_prefix_lens,
+                    )
+                )
 
             dp_rank = self.parallel_config.data_parallel_rank
             if ubatch_slices:
@@ -2502,29 +2612,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 scheduler_output, num_input_tokens, intermediate_tensors
             )
 
-            uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
-                num_scheduled_tokens == self.input_batch.num_reqs * max_query_len
-            )
+            uniform_decode = (
+                max_num_scheduled_tokens == self.uniform_decode_query_len
+            ) and (num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
             batch_descriptor = BatchDescriptor(
                 num_tokens=num_input_tokens,
                 uniform_decode=uniform_decode,
                 has_lora=len(self.input_batch.lora_id_to_lora_request) > 0,
             )
             cudagraph_runtime_mode, batch_descriptor = (
-                self.cudagraph_dispatcher.dispatch(batch_descriptor, use_cascade_attn)
+                self.cudagraph_dispatcher.dispatch(
+                    batch_descriptor,
+                    use_cascade_attn=cascade_attn_prefix_lens is not None,
+                )
             )
 
         # Set cudagraph mode to none if calc_kv_scales is true.
-        if attn_metadata is not None:
-            metadata_list = (
-                attn_metadata.values()
-                if isinstance(attn_metadata, dict)
-                else [attn_metadata]
-            )
-            if any(
-                getattr(m, "enable_kv_scales_calculation", False) for m in metadata_list
-            ):
-                cudagraph_runtime_mode = CUDAGraphMode.NONE
+        # KV scales calculation involves dynamic operations that are incompatible
+        # with CUDA graph capture.
+        if self.calculate_kv_scales:
+            cudagraph_runtime_mode = CUDAGraphMode.NONE
+            # Mark KV scales as calculated after the first forward pass
+            self.calculate_kv_scales = False
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
@@ -2538,7 +2647,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 batch_descriptor=batch_descriptor,
                 ubatch_slices=ubatch_slices,
             ),
-            record_function_or_nullcontext("Forward"),
+            record_function_or_nullcontext("gpu_model_runner: forward"),
             self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
         ):
             model_output = self._model_forward(
@@ -2549,7 +2658,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 **model_kwargs,
             )
 
-        with record_function_or_nullcontext("Postprocess"):
+        with record_function_or_nullcontext("gpu_model_runner: postprocess"):
             if self.use_aux_hidden_state_outputs:
                 # True when EAGLE 3 is used.
                 hidden_states, aux_hidden_states = model_output
@@ -2646,12 +2755,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 scheduler_output, grammar_output, self.input_batch, logits
             )
 
-        with record_function_or_nullcontext("Sample"):
+        with record_function_or_nullcontext("gpu_model_runner: sample"):
             sampler_output = self._sample(logits, spec_decode_metadata)
 
         def propose_draft_token_ids(sampled_token_ids):
             assert spec_decode_common_attn_metadata is not None
-            with record_function_or_nullcontext("Draft"):
+            with record_function_or_nullcontext("gpu_model_runner: draft"):
                 self._draft_token_ids = self.propose_draft_token_ids(
                     scheduler_output,
                     sampled_token_ids,
@@ -2689,7 +2798,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # as inputs, and does not need to wait for bookkeeping to finish.
             propose_draft_token_ids(sampler_output.sampled_token_ids)
 
-        with record_function_or_nullcontext("Bookkeep"):
+        with record_function_or_nullcontext("gpu_model_runner: bookkeep"):
             (
                 num_nans_in_logits,
                 logprobs_lists,
@@ -2716,37 +2825,41 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # tokens on the CPU, so they are run after bookkeeping.
             propose_draft_token_ids(valid_sampled_token_ids)
 
-        with record_function_or_nullcontext("EPLB"):
+        with record_function_or_nullcontext("gpu_model_runner: eplb"):
             self.eplb_step()
-
-        output = ModelRunnerOutput(
-            req_ids=req_ids_output_copy,
-            req_id_to_index=req_id_to_index_output_copy,
-            sampled_token_ids=valid_sampled_token_ids,
-            logprobs=logprobs_lists,
-            prompt_logprobs_dict=prompt_logprobs_dict,
-            pooler_output=[],
-            kv_connector_output=kv_connector_output,
-            num_nans_in_logits=num_nans_in_logits,
-        )
+        with record_function_or_nullcontext("gpu_model_runner: ModelRunnerOutput"):
+            output = ModelRunnerOutput(
+                req_ids=req_ids_output_copy,
+                req_id_to_index=req_id_to_index_output_copy,
+                sampled_token_ids=valid_sampled_token_ids,
+                logprobs=logprobs_lists,
+                prompt_logprobs_dict=prompt_logprobs_dict,
+                pooler_output=[],
+                kv_connector_output=kv_connector_output,
+                num_nans_in_logits=num_nans_in_logits,
+            )
 
         if not self.use_async_scheduling:
             return output
-
-        async_output = AsyncGPUModelRunnerOutput(
-            model_runner_output=output,
-            sampled_token_ids=sampler_output.sampled_token_ids,
-            logprobs_tensors=sampler_output.logprobs_tensors,
-            invalid_req_indices=invalid_req_indices,
-            async_output_copy_stream=self.async_output_copy_stream,
-        )
-
-        # Save ref of sampled_token_ids CPU tensor if the batch contains
-        # any requests with sampling params that that require output ids.
-        self.input_batch.set_async_sampled_token_ids(
-            async_output.sampled_token_ids_cpu,
-            async_output.async_copy_ready_event,
-        )
+        with record_function_or_nullcontext(
+            "gpu_model_runner: AsyncGPUModelRunnerOutput"
+        ):
+            async_output = AsyncGPUModelRunnerOutput(
+                model_runner_output=output,
+                sampled_token_ids=sampler_output.sampled_token_ids,
+                logprobs_tensors=sampler_output.logprobs_tensors,
+                invalid_req_indices=invalid_req_indices,
+                async_output_copy_stream=self.async_output_copy_stream,
+            )
+        with record_function_or_nullcontext(
+            "gpu_model_runner: set_async_sampled_token_ids"
+        ):
+            # Save ref of sampled_token_ids CPU tensor if the batch contains
+            # any requests with sampling params that that require output ids.
+            self.input_batch.set_async_sampled_token_ids(
+                async_output.sampled_token_ids_cpu,
+                async_output.async_copy_ready_event,
+            )
 
         return async_output
 
@@ -2783,6 +2896,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 self.input_batch.token_ids_cpu,
                 self.input_batch.spec_decode_unsupported_reqs,
             )
+        elif self.speculative_config.method == "suffix":
+            assert isinstance(sampled_token_ids, list)
+            assert isinstance(self.drafter, SuffixDecodingProposer)
+            draft_token_ids = self.drafter.propose(self.input_batch, sampled_token_ids)
         elif self.speculative_config.method == "medusa":
             assert isinstance(sampled_token_ids, list)
             assert isinstance(self.drafter, MedusaProposer)
@@ -2793,7 +2910,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             else:
                 indices = []
                 offset = 0
-                assert spec_decode_metadata is not None
+                assert spec_decode_metadata is not None, (
+                    "No spec decode metadata for medusa"
+                )
                 for num_draft, tokens in zip(
                     spec_decode_metadata.num_draft_tokens, sampled_token_ids
                 ):
@@ -2924,32 +3043,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.model_config.model,
             scope="global",
         )
-        if eep_scale_up:
-            from vllm.distributed.parallel_state import get_ep_group
-
-            num_local_physical_experts = torch.empty(1, dtype=torch.int32, device="cpu")
-            torch.distributed.broadcast(
-                num_local_physical_experts, group=get_ep_group().cpu_group, group_src=0
-            )
-            num_local_physical_experts = int(num_local_physical_experts.item())
-            new_ep_size = get_ep_group().world_size
-            global_expert_load, old_global_expert_indices = EplbState.recv_state()
-            num_logical_experts = global_expert_load.shape[1]
-            self.parallel_config.eplb_config.num_redundant_experts = (
-                num_local_physical_experts * new_ep_size - num_logical_experts
-            )
-            assert old_global_expert_indices.shape[1] % num_local_physical_experts == 0
-            old_ep_size = (
-                old_global_expert_indices.shape[1] // num_local_physical_experts
-            )
-            rank_mapping = {
-                old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size)
-            }
-        else:
-            global_expert_load = None
-            old_global_expert_indices = None
-            rank_mapping = None
+        global_expert_loads, old_global_expert_indices_per_model, rank_mapping = (
+            EplbState.get_eep_state(self.parallel_config)
+            if eep_scale_up
+            else (None, None, None)
+        )
 
+        if self.parallel_config.enable_eplb:
+            self.eplb_state = EplbState(self.parallel_config, self.device)
+            eplb_models = 0
         with DeviceMemoryProfiler() as m:
             time_before_load = time.perf_counter()
             model_loader = get_model_loader(self.load_config)
@@ -2961,8 +3063,39 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     self.model, self.vllm_config, self.device
                 )
             if hasattr(self, "drafter"):
-                logger.info("Loading drafter model...")
+                logger.info_once("Loading drafter model...")
                 self.drafter.load_model(self.model)
+                if (
+                    hasattr(self.drafter, "model")
+                    and is_mixture_of_experts(self.drafter.model)
+                    and self.parallel_config.enable_eplb
+                ):
+                    logger.info_once(
+                        "EPLB is enabled for drafter model %s.",
+                        self.vllm_config.speculative_config.draft_model_config.model,
+                    )
+
+                    global_expert_load = (
+                        global_expert_loads[eplb_models]
+                        if global_expert_loads
+                        else None
+                    )
+                    old_global_expert_indices = (
+                        old_global_expert_indices_per_model[eplb_models]
+                        if old_global_expert_indices_per_model
+                        else None
+                    )
+                    if self.eplb_state is None:
+                        self.eplb_state = EplbState(self.parallel_config, self.device)
+                    self.eplb_state.add_model(
+                        self.drafter.model,
+                        self.vllm_config.speculative_config.draft_model_config,
+                        global_expert_load,
+                        old_global_expert_indices,
+                        rank_mapping,
+                    )
+                    eplb_models += 1
+
             if self.use_aux_hidden_state_outputs:
                 if not supports_eagle3(self.get_model()):
                     raise RuntimeError(
@@ -2985,24 +3118,31 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             time_after_load = time.perf_counter()
         self.model_memory_usage = m.consumed_memory
         logger.info_once(
-            "Model loading took %.4f GiB and %.6f seconds",
+            "Model loading took %.4f GiB memory and %.6f seconds",
             self.model_memory_usage / GiB_bytes,
             time_after_load - time_before_load,
             scope="local",
         )
         prepare_communication_buffer_for_model(self.model)
-
         self.is_multimodal_pruning_enabled = (
             supports_multimodal_pruning(self.get_model())
             and self.model_config.multimodal_config.is_multimodal_pruning_enabled()
         )
 
         if is_mixture_of_experts(self.model) and self.parallel_config.enable_eplb:
-            logger.info("EPLB is enabled for model %s.", self.model_config.model)
-            self.eplb_state = EplbState.build(
+            logger.info_once("EPLB is enabled for model %s.", self.model_config.model)
+            global_expert_load = (
+                global_expert_loads[eplb_models] if global_expert_loads else None
+            )
+            old_global_expert_indices = (
+                old_global_expert_indices_per_model[eplb_models]
+                if old_global_expert_indices_per_model
+                else None
+            )
+            assert self.eplb_state is not None
+            self.eplb_state.add_model(
                 self.model,
-                self.device,
-                self.parallel_config,
+                self.model_config,
                 global_expert_load,
                 old_global_expert_indices,
                 rank_mapping,
@@ -3260,6 +3400,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 device=self.device,
                 pin_memory=self.pin_memory,
                 merge_by_field_config=model.merge_by_field_config,
+                multimodal_cpu_fields=model.multimodal_cpu_fields,
             )
         )
 
@@ -3353,6 +3494,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         assert len(num_scheduled_tokens_list) == num_reqs
         num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32)
         total_num_scheduled_tokens = int(num_scheduled_tokens.sum())
+        num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
 
         # Disable DP padding when running eager
         allow_dp_padding = self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
@@ -3378,10 +3520,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # If force_attention is True, we always capture attention. Otherwise,
         # it only happens for cudagraph_runtime_mode=FULL.
         if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
-            attn_metadata = {}
-            if ubatch_slices is not None:
-                attn_metadata = [dict() for _ in range(len(ubatch_slices))]
-
             if create_mixed_batch:
                 # In the mixed batch mode (used for FI warmup), we use
                 # shorter sequence lengths to run faster.
@@ -3397,58 +3535,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens
             self.query_start_loc.copy_to_gpu()
 
-            for kv_cache_group_id, kv_cache_group_spec in enumerate(
-                self.kv_cache_config.kv_cache_groups
-            ):
-                common_attn_metadata = CommonAttentionMetadata(
-                    query_start_loc=self.query_start_loc.gpu[: num_reqs + 1],
-                    query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs + 1],
-                    seq_lens=self.seq_lens.gpu[:num_reqs],
-                    seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
-                    num_computed_tokens_cpu=self.input_batch.num_computed_tokens_cpu_tensor[
-                        :num_reqs
-                    ],
-                    num_reqs=num_reqs,
-                    num_actual_tokens=num_tokens,
-                    max_query_len=max_query_len,
-                    max_seq_len=self.max_model_len,
-                    block_table_tensor=self.input_batch.block_table[
-                        kv_cache_group_id
-                    ].get_device_tensor(num_reqs),
-                    slot_mapping=self.input_batch.block_table[
-                        kv_cache_group_id
-                    ].slot_mapping.gpu[:num_tokens],
-                    causal=True,
-                    dcp_local_seq_lens=self.dcp_local_seq_lens.gpu[:num_reqs]
-                    if self.dcp_world_size > 1
-                    else None,
-                )
-                for attn_group in self.attn_groups[kv_cache_group_id]:
-                    if ubatch_slices is not None:
-                        common_attn_metadata_list = split_attn_metadata(
-                            ubatch_slices, common_attn_metadata
-                        )
-                        for ubid, common_attn_metadata in enumerate(
-                            common_attn_metadata_list
-                        ):
-                            assert common_attn_metadata.max_query_len == 1
-                            attn_metadata_i = attn_group.get_metadata_builder(
-                                ubatch_id=ubid
-                            ).build_for_cudagraph_capture(common_attn_metadata)
-                            for layer_name in attn_group.layer_names:
-                                assert type(attn_metadata) is list
-                                attn_metadata[ubid][layer_name] = attn_metadata_i
-                    else:
-                        assert type(attn_metadata) is dict
-                        metadata_builder = attn_group.get_metadata_builder()
-                        attn_metadata_i = metadata_builder.build_for_cudagraph_capture(
-                            common_attn_metadata
-                        )
-                        for layer_name in attn_group.layer_names:
-                            attn_metadata[layer_name] = attn_metadata_i
+            attn_metadata, _ = self._build_attention_metadata(
+                total_num_scheduled_tokens=num_tokens,
+                max_num_scheduled_tokens=max_query_len,
+                num_reqs=num_reqs,
+                ubatch_slices=ubatch_slices,
+                for_cudagraph_capture=True,
+            )
 
         with self.maybe_dummy_run_with_lora(
-            self.lora_config, num_scheduled_tokens, activate_lora, remove_lora
+            self.lora_config,
+            num_scheduled_tokens,
+            num_sampled_tokens,
+            activate_lora,
+            remove_lora,
         ):
             # Make sure padding doesn't exceed max_num_tokens
             assert num_tokens_after_padding <= self.max_num_tokens
@@ -3553,7 +3653,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
                     and not self.speculative_config.enforce_eager
                 )
-                self.drafter.dummy_run(num_tokens, use_cudagraphs=use_cudagraphs)
+
+                # Note(gnovack) - We need to disable cudagraphs for one of the two
+                # lora cases when cudagraph_specialize_lora is enabled. This is a
+                # short term mitigation for issue mentioned in
+                # https://github.com/vllm-project/vllm/issues/28334
+                if self.compilation_config.cudagraph_specialize_lora and activate_lora:
+                    use_cudagraphs = False
+
+                self.drafter.dummy_run(
+                    num_tokens,
+                    use_cudagraphs=use_cudagraphs,
+                )
 
         # This is necessary to avoid blocking DP.
         # For dummy runs, we typically skip EPLB since we don't have any real
@@ -4039,16 +4150,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         ) -> list[AttentionGroup]:
             attn_groups: list[AttentionGroup] = []
             for (attn_backend, kv_cache_spec), layer_names in attn_backends_map.items():
-                attn_group = AttentionGroup.create_with_metadata_builders(
+                attn_group = AttentionGroup(
                     attn_backend,
                     layer_names,
                     kv_cache_spec,
-                    self.vllm_config,
-                    self.device,
                     kv_cache_group_id,
-                    num_metadata_builders=1
-                    if not self.parallel_config.enable_dbo
-                    else 2,
                 )
 
                 attn_groups.append(attn_group)
@@ -4067,7 +4173,27 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         for i, attn_backend_map in enumerate(attention_backend_maps):
             self.attn_groups.append(create_attn_groups(attn_backend_map, i))
 
+    def initialize_metadata_builders(
+        self, kv_cache_config: KVCacheConfig, kernel_block_sizes: list[int]
+    ) -> None:
+        """
+        Create the metadata builders for all KV cache groups and attn groups.
+        """
+        for kv_cache_group_id in range(len(kv_cache_config.kv_cache_groups)):
+            for attn_group in self.attn_groups[kv_cache_group_id]:
+                attn_group.create_metadata_builders(
+                    self.vllm_config,
+                    self.device,
+                    kernel_block_sizes[kv_cache_group_id]
+                    if kv_cache_group_id < len(kernel_block_sizes)
+                    else None,
+                    num_metadata_builders=1
+                    if not self.parallel_config.enable_dbo
+                    else 2,
+                )
         # Calculate reorder batch threshold (if needed)
+        # Note (tdoublep): do this *after* constructing builders,
+        # because some of them change the threshold at init time.
         self.calculate_reorder_batch_threshold()
 
     def _check_and_update_cudagraph_mode(
@@ -4389,9 +4515,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             list[int]: List of kernel block sizes for each cache group.
         """
         kernel_block_sizes = []
-        for kv_cache_group_id, kv_cache_group in enumerate(
-            kv_cache_config.kv_cache_groups
-        ):
+        for kv_cache_gid, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
             kv_cache_spec = kv_cache_group.kv_cache_spec
             if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
                 # All layers in the UniformTypeKVCacheSpecs have the same type,
@@ -4403,7 +4527,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # This is an attention backend that supports virtual
                 # block splitting. Get the supported block sizes from
                 # all backends in the group.
-                attn_groups = self.attn_groups[kv_cache_group_id]
+                attn_groups = self.attn_groups[kv_cache_gid]
                 kv_manager_block_size = kv_cache_group.kv_cache_spec.block_size
                 selected_kernel_size = self.select_common_block_size(
                     kv_manager_block_size, attn_groups
@@ -4633,6 +4757,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # kernel_block_size 64 and split the 256-token-block to 4 blocks with 64
         # tokens each.
         kernel_block_sizes = self._prepare_kernel_block_sizes(kv_cache_config)
+
+        # create metadata builders
+        self.initialize_metadata_builders(kv_cache_config, kernel_block_sizes)
+
         # Reinitialize need to after initialize_attn_backend
         self.may_reinitialize_input_batch(kv_cache_config, kernel_block_sizes)
         kv_caches = self.initialize_kv_cache_tensors(
@@ -4651,10 +4779,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks)
 
         if self.dcp_world_size > 1:
-            layer_names = self.attn_groups[0][0].layer_names
-            layers = get_layers_from_vllm_config(
-                self.vllm_config, AttentionLayerBase, layer_names
-            )
+            layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase)
             for layer in layers.values():
                 assert layer.impl.need_to_return_lse_for_decode, (
                     "DCP requires attention impls to return"
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index c2bf1419bebd7..19061fcffdf1a 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -32,8 +32,10 @@ from vllm.distributed.parallel_state import (
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
+from vllm.model_executor.models.interfaces import is_mixture_of_experts
 from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
 from vllm.platforms import current_platform
+from vllm.profiler.gpu_profiler import CudaProfilerWrapper
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
 from vllm.utils.mem_constants import GiB_bytes
@@ -115,6 +117,8 @@ class Worker(WorkerBase):
                     torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
                 ),
             )
+        elif envs.VLLM_TORCH_CUDA_PROFILE:
+            self.profiler = CudaProfilerWrapper()
         else:
             self.profiler = None
 
@@ -380,9 +384,7 @@ class Worker(WorkerBase):
         # NOTE(Kuntai): This need to be done before `initialize_kv_cache`,
         # because `initialize_kv_cache` will inject kv cache groups not
         # related to kv cache connector (e.g. kv cache sharing layers).
-        connector_vllm_config = copy.copy(self.vllm_config)
-        connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config)
-        ensure_kv_transfer_initialized(connector_vllm_config)
+        ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
 
         if self.vllm_config.model_config.enable_sleep_mode:
             from vllm.device_allocator.cumem import CuMemAllocator
@@ -510,9 +512,22 @@ class Worker(WorkerBase):
     def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
         return self.model_runner.get_supported_tasks()
 
+    def annotate_profile(self, scheduler_output):
+        # add trace annotation so that we can easily distinguish
+        # new/cached request numbers in each iteration
+        if not self.profiler:
+            return nullcontext()
+
+        num_new = len(scheduler_output.scheduled_new_reqs)
+        num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids)
+
+        return torch.profiler.record_function(
+            f"execute_new_{num_new}_cached_{num_cached}"
+        )
+
     @torch.inference_mode()
     def sample_tokens(
-        self, grammar_output: "GrammarOutput"
+        self, grammar_output: "GrammarOutput | None"
     ) -> ModelRunnerOutput | AsyncModelRunnerOutput:
         return self.model_runner.sample_tokens(grammar_output)
 
@@ -537,9 +552,12 @@ class Worker(WorkerBase):
                 )
             )
 
-        output = self.model_runner.execute_model(scheduler_output, intermediate_tensors)
-        if isinstance(output, (ModelRunnerOutput, NoneType)):
-            return output
+        with self.annotate_profile(scheduler_output):
+            output = self.model_runner.execute_model(
+                scheduler_output, intermediate_tensors
+            )
+            if isinstance(output, (ModelRunnerOutput, NoneType)):
+                return output
 
         assert isinstance(output, IntermediateTensors)
         parallel_config = self.vllm_config.parallel_config
@@ -578,7 +596,10 @@ class Worker(WorkerBase):
         else:
             self.profiler.stop()
             # only print profiler results on rank 0
-            if self.local_rank == 0:
+            if (
+                isinstance(self.profiler, torch.profiler.profile)
+                and self.local_rank == 0
+            ):
                 print(
                     self.profiler.key_averages().table(sort_by="self_cuda_time_total")
                 )
@@ -615,7 +636,6 @@ class Worker(WorkerBase):
         }
         assert self.model_runner.eplb_state is not None
         self.model_runner.eplb_state.rearrange(
-            self.model_runner.model,
             execute_shuffle=True,
             global_expert_load=None,
             rank_mapping=rank_mapping,
@@ -628,7 +648,7 @@ class Worker(WorkerBase):
         self,
         old_ep_size: int,
         new_ep_size: int,
-        global_expert_load: torch.Tensor | None,
+        global_expert_loads: list[torch.Tensor] | None,
     ) -> None:
         from vllm.distributed.parallel_state import get_ep_group
 
@@ -637,9 +657,8 @@ class Worker(WorkerBase):
         rank_mapping = {old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size)}
         assert self.model_runner.eplb_state is not None
         self.model_runner.eplb_state.rearrange(
-            self.model_runner.model,
             execute_shuffle=True,
-            global_expert_load=global_expert_load,
+            global_expert_loads=global_expert_loads,
             rank_mapping=rank_mapping,
         )
         if get_ep_group().rank == 0:
@@ -686,31 +705,56 @@ class Worker(WorkerBase):
             get_ep_group,
             prepare_communication_buffer_for_model,
         )
-        from vllm.model_executor.layers.fused_moe.layer import FusedMoEParallelConfig
+        from vllm.model_executor.layers.fused_moe.layer import (
+            FusedMoE,
+            FusedMoEParallelConfig,
+        )
 
         parallel_config = self.vllm_config.parallel_config
-        moe_modules = [
-            module
-            for module in self.model_runner.model.modules()
-            if (
-                module.__class__.__name__ == "FusedMoE"
-                or module.__class__.__name__ == "SharedFusedMoE"
-            )
-        ]
-        num_local_experts = moe_modules[0].moe_config.num_local_experts
-        assert all(
-            module.moe_config.num_local_experts == num_local_experts
-            for module in moe_modules
-        ), "All MoE modules must have the same number of experts"
-        for module in moe_modules:
-            module.moe_config.num_experts = num_local_experts * new_ep_size
-            module.global_num_experts = module.moe_config.num_experts
-            module.moe_parallel_config = FusedMoEParallelConfig.make(
-                tp_size_=get_tp_group().world_size,
-                dp_size_=get_dp_group().world_size,
-                vllm_parallel_config=parallel_config,
-            )
-            module.moe_config.moe_parallel_config = module.moe_parallel_config
+
+        def get_moe_modules(model: torch.nn.Module) -> list[FusedMoE]:
+            return [
+                module
+                for module in model.modules()
+                if (
+                    module.__class__.__name__ == "FusedMoE"
+                    or module.__class__.__name__ == "SharedFusedMoE"
+                )
+            ]
+
+        def update_moe_modules(moe_modules: list[FusedMoE], num_local_experts: int):
+            assert all(
+                module.moe_config.num_local_experts == num_local_experts
+                for module in moe_modules
+            ), "All MoE modules must have the same number of experts"
+            for module in moe_modules:
+                module.moe_config.num_experts = num_local_experts * new_ep_size
+                module.global_num_experts = module.moe_config.num_experts
+                module.moe_parallel_config = FusedMoEParallelConfig.make(
+                    tp_size_=get_tp_group().world_size,
+                    dp_size_=get_dp_group().world_size,
+                    vllm_parallel_config=parallel_config,
+                )
+                module.moe_config.moe_parallel_config = module.moe_parallel_config
+            return moe_modules
+
+        model_moe_modules = get_moe_modules(self.model_runner.model)
+        num_local_experts = model_moe_modules[0].moe_config.num_local_experts
+
+        update_moe_modules(model_moe_modules, num_local_experts)
+        drafter_model = None
+        if hasattr(self.model_runner, "drafter") and hasattr(
+            self.model_runner.drafter, "model"
+        ):
+            drafter_model = self.model_runner.drafter.model
+        if drafter_model is not None and is_mixture_of_experts(drafter_model):
+            drafter_moe_modules = get_moe_modules(drafter_model)
+            # Check if drafter and model have matching configs
+            assert (
+                drafter_moe_modules[0].moe_config.num_local_experts == num_local_experts
+            ), "Drafter and model configs should be the same"
+            update_moe_modules(drafter_moe_modules, num_local_experts)
+
         if new_ep_size < old_ep_size:
             num_local_physical_experts = num_local_experts
             assert self.model_runner.eplb_state is not None
@@ -721,7 +765,7 @@ class Worker(WorkerBase):
                 new_physical_experts
                 - self.model_runner.eplb_state.logical_replica_count.shape[1]
             )
-            global_expert_load = None
+            global_expert_loads = None
         else:
             num_local_physical_experts = torch.tensor(
                 [num_local_experts], dtype=torch.int32, device="cpu"
@@ -732,18 +776,20 @@ class Worker(WorkerBase):
             num_local_physical_experts = num_local_physical_experts.item()
             new_physical_experts = num_local_physical_experts * new_ep_size
             assert self.model_runner.eplb_state is not None
-            global_expert_load = self.model_runner.eplb_state.rearrange(
-                self.model_runner.model, execute_shuffle=False
+            global_expert_loads = self.model_runner.eplb_state.rearrange(
+                execute_shuffle=False
             )
             parallel_config.eplb_config.num_redundant_experts = (
-                new_physical_experts - global_expert_load.shape[1]
+                new_physical_experts - global_expert_loads[0].shape[1]
             )
         prepare_communication_buffer_for_model(self.model_runner.model)
+        if drafter_model is not None:
+            prepare_communication_buffer_for_model(drafter_model)
         self.model_runner.model.update_physical_experts_metadata(
             num_physical_experts=new_physical_experts,
             num_local_physical_experts=num_local_physical_experts,
         )
-        return global_expert_load
+        return global_expert_loads
 
     def reinitialize_distributed(
         self, reconfig_request: ReconfigureDistributedRequest
@@ -784,11 +830,11 @@ class Worker(WorkerBase):
                 self.local_rank,
             )
 
-        global_expert_load = self._reconfigure_moe(old_ep_size, new_ep_size)
+        global_expert_loads = self._reconfigure_moe(old_ep_size, new_ep_size)
 
         if new_ep_size > old_ep_size:
-            assert global_expert_load is not None
-            self._eplb_after_scale_up(old_ep_size, new_ep_size, global_expert_load)
+            assert global_expert_loads is not None
+            self._eplb_after_scale_up(old_ep_size, new_ep_size, global_expert_loads)
 
     def save_sharded_state(
         self,
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index 372bc0a056731..37abe56494609 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -38,7 +38,6 @@ class LoRAModelRunnerMixin:
                 "Regarding multimodal models, vLLM currently "
                 "only supports adding LoRA to language model."
             )
-
         # Add LoRA Manager to the Model Runner
         self.lora_manager = LRUCacheWorkerLoRAManager(
             vllm_config,
@@ -70,13 +69,19 @@ class LoRAModelRunnerMixin:
             raise RuntimeError("LoRA is not enabled. Use --enable-lora to enable LoRA.")
 
     def set_active_loras(
-        self, input_batch: InputBatch, num_scheduled_tokens: np.ndarray
+        self,
+        input_batch: InputBatch,
+        num_scheduled_tokens: np.ndarray,
+        num_sampled_tokens: np.ndarray | None = None,
     ) -> None:
-        prompt_lora_mapping: tuple[int, ...]  # of size input_batch.num_reqs
+        if num_sampled_tokens is None:
+            num_sampled_tokens = np.ones_like(num_scheduled_tokens, dtype=np.int32)
+
+        prompt_lora_mapping: tuple[int, ...]  # of size np.sum(num_sampled_tokens)
         token_lora_mapping: tuple[int, ...]  # of size np.sum(num_scheduled_tokens)
         lora_requests: set[LoRARequest]
         prompt_lora_mapping, token_lora_mapping, lora_requests = (
-            input_batch.make_lora_inputs(num_scheduled_tokens)
+            input_batch.make_lora_inputs(num_scheduled_tokens, num_sampled_tokens)
         )
         return self._set_active_loras(
             prompt_lora_mapping, token_lora_mapping, lora_requests
@@ -123,8 +128,12 @@ class LoRAModelRunnerMixin:
         self,
         lora_config: LoRAConfig | None,
         num_scheduled_tokens: np.ndarray,
+        num_sampled_tokens: np.ndarray | None = None,
         activate_lora: bool = True,
     ):
+        if num_sampled_tokens is None:
+            num_sampled_tokens = np.ones_like(num_scheduled_tokens, dtype=np.int32)
+
         if lora_config is None:
             yield
         else:
@@ -143,6 +152,9 @@ class LoRAModelRunnerMixin:
             else:
                 prompt_lora_mapping = np.zeros(num_reqs, dtype=np.int32)
 
+            # Make sample lora mapping
+            sample_lora_mapping = np.repeat(prompt_lora_mapping, num_sampled_tokens)
+
             # Make token lora mapping
             token_lora_mapping = np.repeat(prompt_lora_mapping, num_scheduled_tokens)
 
@@ -157,7 +169,7 @@ class LoRAModelRunnerMixin:
             }
 
             self._set_active_loras(
-                tuple(prompt_lora_mapping), tuple(token_lora_mapping), lora_requests
+                tuple(sample_lora_mapping), tuple(token_lora_mapping), lora_requests
             )
 
             yield
@@ -167,13 +179,14 @@ class LoRAModelRunnerMixin:
         self,
         lora_config: LoRAConfig | None,
         num_scheduled_tokens: np.ndarray,
+        num_sampled_tokens: np.ndarray,
         activate_lora: bool = True,
         remove_lora: bool = True,
     ):
         with (
             self.maybe_setup_dummy_loras(lora_config, remove_lora),
             self.maybe_select_dummy_loras(
-                lora_config, num_scheduled_tokens, activate_lora
+                lora_config, num_scheduled_tokens, num_sampled_tokens, activate_lora
             ),
         ):
             yield
diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py
index d3fb17054c1a7..6bf4f91931849 100644
--- a/vllm/v1/worker/tpu_input_batch.py
+++ b/vllm/v1/worker/tpu_input_batch.py
@@ -526,7 +526,7 @@ class InputBatch:
         return prompt_token_ids_cpu_tensor.to(device=self.device, non_blocking=True)
 
     def make_lora_inputs(
-        self, num_scheduled_tokens: np.ndarray
+        self, num_scheduled_tokens: np.ndarray, num_sampled_tokens: np.ndarray
     ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]:
         """
         Given the num_scheduled_tokens for each request in the batch, return
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 0e34504a5e268..26816ce0f2091 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -952,6 +952,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             device=self.device,
             pin_memory=self.pin_memory,
             merge_by_field_config=model.merge_by_field_config,
+            multimodal_cpu_fields=model.multimodal_cpu_fields,
         ):
             # Run the encoder.
             # `curr_group_outputs` is either of the following:
@@ -2037,6 +2038,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 device=self.device,
                 pin_memory=self.pin_memory,
                 merge_by_field_config=model.merge_by_field_config,
+                multimodal_cpu_fields=model.multimodal_cpu_fields,
             )
         )
 
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 396adbcfb289f..0ca7e81a5c7b8 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import defaultdict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING
 
 import torch
@@ -134,31 +134,37 @@ class MultiModalBudget:
 @dataclass
 class AttentionGroup:
     backend: type[AttentionBackend]
-    # When ubatching is enabled we will have a metadata builder for each ubatch
-    # so that if they use internal persistant buffers for cudagraphs, and they
-    # won't have to worry about conflicting with the other ubatches.
-    metadata_builders: list[AttentionMetadataBuilder]
     layer_names: list[str]
     kv_cache_spec: KVCacheSpec
     kv_cache_group_id: int
+    # When ubatching is enabled we will have a metadata builder for each ubatch
+    # so that if they use internal persistant buffers for cudagraphs, and they
+    # won't have to worry about conflicting with the other ubatches.
+    metadata_builders: list[AttentionMetadataBuilder] = field(
+        default_factory=lambda: []
+    )
 
-    @staticmethod
-    def create_with_metadata_builders(
-        backend: type[AttentionBackend],
-        layer_names: list[str],
-        kv_cache_spec: KVCacheSpec,
-        vllm_config: VllmConfig,
-        device: torch.device,
-        kv_cache_group_id: int,
+    def create_metadata_builders(
+        self,
+        vllm_config,
+        device,
+        kernel_block_size: int | None,
         num_metadata_builders: int = 1,
-    ) -> "AttentionGroup":
-        metadata_builders = [
-            backend.get_builder_cls()(kv_cache_spec, layer_names, vllm_config, device)
+    ):
+        kv_cache_spec_builder = (
+            self.kv_cache_spec.copy_with_new_block_size(kernel_block_size)
+            if kernel_block_size is not None
+            else self.kv_cache_spec
+        )
+        self.metadata_builders = [
+            self.backend.get_builder_cls()(
+                kv_cache_spec_builder,
+                self.layer_names,
+                vllm_config,
+                device,
+            )
             for _ in range(num_metadata_builders)
         ]
-        return AttentionGroup(
-            backend, metadata_builders, layer_names, kv_cache_spec, kv_cache_group_id
-        )
 
     def get_metadata_builder(self, ubatch_id: int = 0) -> AttentionMetadataBuilder:
         assert len(self.metadata_builders) > ubatch_id