diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 12f730738b8a5..38c400ba1faf5 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -132,7 +132,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 58fd435691f4a..0e5b21ddf25b3 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface" commands=$@ echo "Commands:$commands" -if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then - commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"} -fi +commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"} if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} fi -if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then - commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} -fi +commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"} if [[ $commands == *"pytest -v -s lora"* ]]; then commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"} diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 7927aef19e4eb..7479c43977d78 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -49,6 +49,7 @@ function cpu_tests() { # Run kernel tests docker exec cpu-test-"$NUMA_NODE" bash -c " set -e + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py pytest -x -v -s tests/kernels/test_onednn.py" # Run basic model test @@ -76,7 +77,7 @@ function cpu_tests() { # Run AWQ test # docker exec cpu-test-"$NUMA_NODE" bash -c " # set -e - # VLLM_USE_V1=0 pytest -x -s -v \ + 
# pytest -x -s -v \ # tests/quantization/test_ipex_quant.py" # Run multi-lora tests @@ -116,4 +117,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index bb5ef5d624630..5fd048c2ad0c6 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -348,6 +348,7 @@ steps: - pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py # Integration test for streaming correctness (requires special branch). - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a0d2076199b14..be1b79ddc4324 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -25,6 +25,7 @@ # and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables. # working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests # source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run. +# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch. # When adding a test # - If the test belongs to an existing group, add it there @@ -56,7 +57,7 @@ steps: - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ -- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins timeout_in_minutes: 10 source_file_dependencies: - vllm/ @@ -65,6 +66,7 @@ steps: - tests/multimodal - tests/standalone_tests/lazy_imports.py - tests/transformers_utils + - tests/config no_gpu: true commands: - python3 standalone_tests/lazy_imports.py @@ -72,6 +74,7 @@ steps: - pytest -v -s test_outputs.py - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s transformers_utils + - pytest -v -s config - label: Python-only Installation Test # 10min timeout_in_minutes: 20 @@ -329,6 +332,7 @@ steps: - pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py # Integration test for streaming correctness (requires special branch). 
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine @@ -441,6 +445,7 @@ steps: - vllm/ - tests/compile commands: + - pytest -v -s compile/test_config.py - pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_fusion.py - pytest -v -s compile/test_fusion_attn.py @@ -450,6 +455,7 @@ steps: - pytest -v -s compile/test_decorator.py - pytest -v -s compile/test_noop_elimination.py - pytest -v -s compile/test_aot_compile.py + - pytest -v -s compile/test_qk_norm_rope_fusion.py - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -604,6 +610,7 @@ steps: source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization + autorun_on_main: true commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 @@ -867,12 +874,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -890,11 +897,16 @@ steps: - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/mla/cutlass_mla.py + - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/platforms/cuda.py + - vllm/attention/selector.py commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_attention_selector.py - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py @@ -932,7 +944,7 @@ steps: # this runner has 2 GPUs available even though num_gpus=2 is not set - pytest -v -s tests/compile/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # Wrap with quotes to escape yaml + # Wrap with quotes to escape yaml - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" - label: Blackwell Fusion E2E Tests # 30 min diff --git 
a/.github/CODEOWNERS b/.github/CODEOWNERS index 23def076cf880..f26c782bccf2c 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -61,6 +61,16 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/model_executor/models/transformers @hmellor /tests/models/test_transformers.py @hmellor +# Observability +/vllm/config/observability.py @markmc +/vllm/v1/metrics @markmc +/tests/v1/metrics @markmc +/vllm/tracing.py @markmc +/tests/v1/tracing/test_tracing.py @markmc +/vllm/config/kv_events.py @markmc +/vllm/distributed/kv_events.py @markmc +/tests/distributed/test_events.py @markmc + # Docs /docs/mkdocs @hmellor /docs/**/*.yml @hmellor diff --git a/.github/mergify.yml b/.github/mergify.yml index 18d4a2e83144b..997a40e18e588 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -151,6 +151,23 @@ pull_request_rules: add: - gpt-oss +- name: label-nvidia + description: Automatically apply nvidia label + conditions: + - label != stale + - or: + - files~=cuda + - files~=cutlass + - files~=flashinfer + - files~=trtllm + - title~=(?i)NVIDIA + - title~=(?i)CUDA + - title~=(?i)CUTLASS + actions: + label: + add: + - nvidia + - name: label-rocm description: Automatically apply rocm label conditions: diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e9fa63b178ea..dcc44be87e557 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,13 @@ set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13") # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151") +# ROCm installation prefix. Default to /opt/rocm but allow override via +# -DROCM_PATH=/your/rocm/path when invoking cmake. +if(NOT DEFINED ROCM_PATH) + set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix") +else() + set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE) +endif() # # Supported/expected torch versions for CUDA/ROCm. # @@ -237,10 +244,27 @@ set_gencode_flags_for_srcs( SRCS "${VLLM_CUMEM_EXT_SRC}" CUDA_ARCHS "${CUDA_ARCHS}") -if(VLLM_GPU_LANG STREQUAL "CUDA") +if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") message(STATUS "Enabling cumem allocator extension.") - # link against cuda driver library - list(APPEND CUMEM_LIBS CUDA::cuda_driver) + if(VLLM_GPU_LANG STREQUAL "CUDA") + # link against cuda driver library + list(APPEND CUMEM_LIBS CUDA::cuda_driver) + else() + # link against rocm driver library. Prefer an absolute path to + # libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall + # back to linking by name "amdhip64". + find_library(AMDHIP64_LIB + NAMES amdhip64 libamdhip64.so + PATHS ${ROCM_PATH}/lib + NO_DEFAULT_PATH) + if(AMDHIP64_LIB) + message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}") + list(APPEND CUMEM_LIBS ${AMDHIP64_LIB}) + else() + message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name") + list(APPEND CUMEM_LIBS amdhip64) + endif() + endif() define_extension_target( cumem_allocator DESTINATION vllm @@ -265,6 +289,7 @@ set(VLLM_EXT_SRC "csrc/pos_encoding_kernels.cu" "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" + "csrc/fused_qknorm_rope_kernel.cu" "csrc/layernorm_quant_kernels.cu" "csrc/sampler.cu" "csrc/cuda_view.cu" @@ -330,7 +355,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. 
# 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_ARCHS) # @@ -914,7 +939,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) # diff --git a/README.md b/README.md index b5e230e4b9b07..033e1035d8916 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio *Latest News* 🔥 +- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI) - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link). - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6). - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing). 
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py index f1e504499eaf6..11e3ac7f0c1fa 100644 --- a/benchmarks/kernels/bench_block_fp8_gemm.py +++ b/benchmarks/kernels/bench_block_fp8_gemm.py @@ -1,10 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + +# Disable DeepGEMM for this benchmark to use CUTLASS +os.environ["VLLM_USE_DEEP_GEMM"] = "0" + import torch from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - apply_w8a8_block_fp8_linear, + W8A8BlockFp8LinearOp, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, ) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_BLOCK_FP8_SUPPORTED, @@ -39,13 +47,14 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): fp8_info = torch.finfo(torch.float8_e4m3fn) fp8_max, fp8_min = fp8_info.max, fp8_info.min - # Create random FP8 tensors + # Create random input tensor (bfloat16, will be quantized by W8A8BlockFp8LinearOp) A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max + # Create quantized weight tensor B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - # Create scales + # Create weight scales block_n, block_k = block_size[0], block_size[1] n_tiles = (N + block_n - 1) // block_n k_tiles = (K + block_k - 1) // block_k @@ -55,19 +64,25 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): * factor_for_scale ) - # SM90 CUTLASS requires row-major format for scales - if use_cutlass and current_platform.is_device_capability(90): - Bs = Bs.T.contiguous() + # Create W8A8BlockFp8LinearOp instance + weight_group_shape = GroupShape(block_n, block_k) + act_quant_group_shape = GroupShape(1, block_k) # Per-token, per-group quantization + + linear_op = W8A8BlockFp8LinearOp( + weight_group_shape=weight_group_shape, + act_quant_group_shape=act_quant_group_shape, + cutlass_block_fp8_supported=use_cutlass, + use_aiter_and_is_supported=False, + ) def run(): - if use_cutlass: - return apply_w8a8_block_fp8_linear( - A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True - ) - else: - return apply_w8a8_block_fp8_linear( - A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False - ) + return linear_op.apply( + input=A_ref, + weight=B, + weight_scale=Bs, + input_scale=None, + bias=None, + ) return run diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py index 2674899d1cc56..8cb8a2f386a97 100644 --- a/benchmarks/multi_turn/bench_dataset.py +++ b/benchmarks/multi_turn/bench_dataset.py @@ -11,6 +11,7 @@ from bench_utils import ( Color, logger, ) +from tqdm import tqdm from transformers import AutoTokenizer # type: ignore # Conversation ID is a string (e.g: "UzTK34D") @@ -417,6 +418,10 @@ def generate_conversations( data = file.read() tokens_in_file = tokenizer.encode(data, add_special_tokens=False) list_of_tokens.extend(tokens_in_file) + logger.info( + f"Loaded {len(tokens_in_file)} tokens from file {filename}, " + f"total tokens so far: {len(list_of_tokens)}" + ) conversations: ConversationsMap = {} conv_id = 0 @@ -449,18 +454,25 @@ def generate_conversations( ) base_offset += common_prefix_tokens - for conv_id in range(args.num_conversations): + for conv_id in tqdm( + range(args.num_conversations), + total=args.num_conversations, + 
desc="Generating conversations", + unit="conv", + ): # Generate a single conversation messages: MessagesList = [] nturns = turn_count[conv_id] # User prompt token count per turn (with lower limit) - input_token_count: np.ndarray = args.input_num_tokens.sample(nturns) + input_token_count: np.ndarray = args.input_num_tokens.sample(nturns).astype(int) input_token_count = np.maximum(input_token_count, base_prompt_token_count) # Assistant answer token count per turn (with lower limit) - output_token_count: np.ndarray = args.output_num_tokens.sample(nturns) + output_token_count: np.ndarray = args.output_num_tokens.sample(nturns).astype( + int + ) output_token_count = np.maximum(output_token_count, 1) user_turn = True diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 5d2ac66e5ab94..ae9e9753441aa 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -55,6 +55,7 @@ class ClientArgs(NamedTuple): verify_output: bool conversation_sampling: ConversationSampling request_rate: float + max_retries: int class RequestArgs(NamedTuple): @@ -63,6 +64,7 @@ class RequestArgs(NamedTuple): stream: bool limit_min_tokens: int # Use negative value for no limit limit_max_tokens: int # Use negative value for no limit + timeout_sec: int class BenchmarkArgs(NamedTuple): @@ -214,6 +216,7 @@ async def send_request( stream: bool = True, min_tokens: int | None = None, max_tokens: int | None = None, + timeout_sec: int = 120, ) -> ServerResponse: payload = { "model": model, @@ -235,10 +238,16 @@ async def send_request( headers = {"Content-Type": "application/json"} # Calculate the timeout for the request - timeout_sec = 120 if max_tokens is not None: # Assume TPOT of 200ms and use max_tokens to determine timeout - timeout_sec = max(timeout_sec, int(max_tokens * 0.2)) + token_based_timeout = int(max_tokens * 0.2) + if token_based_timeout > timeout_sec: + timeout_sec = token_based_timeout + logger.info( + "Using timeout of %ds based on max_tokens %d", + timeout_sec, + max_tokens, + ) timeout = aiohttp.ClientTimeout(total=timeout_sec) valid_response = True @@ -409,6 +418,7 @@ async def send_turn( req_args.stream, min_tokens, max_tokens, + req_args.timeout_sec, ) if response.valid is False: @@ -518,6 +528,25 @@ async def poisson_sleep(request_rate: float, verbose: bool = False) -> None: await asyncio.sleep(interval) +async def exponential_backoff_sleep( + attempt_cnt: int, + base_rate: float = 1.0, + backoff_factor: float = 2.0, + jitter_fraction: float = 0.10, + verbose: bool = False, +) -> None: + # Sleep with exponential backoff and jitter after a failed request. + backoff_delay = base_rate * (backoff_factor**attempt_cnt) + jittered_delay = backoff_delay * ( + 1 + np.random.uniform(-jitter_fraction, jitter_fraction) + ) + + if verbose: + logger.info(f"Backoff for {jittered_delay:.3f} seconds...") + + await asyncio.sleep(jittered_delay) + + async def client_main( args: ClientArgs, req_args: RequestArgs, @@ -646,49 +675,62 @@ async def client_main( ) time_of_last_turn[conv_id] = curr_time_sec - success = True - try: - result = await send_turn( - session, - client_id, - conv_id, - messages, - current_turn, - tokenizer, - req_args, - args.print_content, - args.verify_output, - ) - if result is not None: - result_queue.put(result) - else: - # None means that the request failed, - # and should not be added to the statistics. 
- success = False - num_failures += 1 - - logger.warning( - f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + success = False + for attempt_cnt in range(args.max_retries + 1): + try: + exception = False + result = await send_turn( + session, + client_id, + conv_id, + messages, + current_turn, + tokenizer, + req_args, + args.print_content, + args.verify_output, + ) + if result is not None: + result_queue.put(result) + success = True + break + else: + logger.warning( + f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + except asyncio.exceptions.TimeoutError: + exception = True + logger.error( + "%sClient %d - Timeout during conversation ID %s (turn: %d). " + "Base timeout is %ss (set with --request-timeout-sec), but the " + "effective timeout may be longer based on max_tokens. If this " + "is unexpected, consider increasing the timeout or checking " + "model performance.%s", + Color.RED, + client_id, + conv_id, + current_turn, + req_args.timeout_sec, + Color.RESET, + ) + except Exception: + exception = True + logger.exception( + f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 ) - # Remove the conversation (should not be used again) - active_convs.pop(conv_id) + # Sleep before retry if not last attempt + if not success and attempt_cnt < args.max_retries: + await exponential_backoff_sleep(attempt_cnt, verbose=args.verbose) - except asyncio.exceptions.TimeoutError: + if not success: num_failures += 1 - logger.exception( - f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 - ) - break # Exit gracefully instead of raising an error + # Remove the conversation (should not be used again) + active_convs.pop(conv_id) + if exception: + break # Exit gracefully instead of raising an error - except Exception: - num_failures += 1 - logger.exception( - f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 - ) - break # Exit gracefully instead of raising an error - - if success: + else: num_successes += 1 # Update the turns counter to include the LLM response @@ -803,6 +845,7 @@ def get_client_config( verify_output=args.verify_output, conversation_sampling=args.conversation_sampling, request_rate=args.request_rate, + max_retries=args.max_retries, ) if args.limit_min_tokens > 0 or args.limit_max_tokens > 0: @@ -815,6 +858,9 @@ def get_client_config( "Invalid min/max tokens limits (min should not be larger than max)" ) + if args.request_timeout_sec <= 0: + raise ValueError("Request timeout must be a positive number") + # Arguments for API requests chat_url = f"{args.url}/v1/chat/completions" model_name = args.served_model_name if args.served_model_name else args.model @@ -825,6 +871,7 @@ def get_client_config( stream=not args.no_stream, limit_min_tokens=args.limit_min_tokens, limit_max_tokens=args.limit_max_tokens, + timeout_sec=args.request_timeout_sec, ) return client_args, req_args @@ -968,7 +1015,7 @@ async def main_mp( f"(is alive: {client.is_alive()}){Color.RESET}" ) - client.join(timeout=120) + client.join(timeout=req_args.timeout_sec + 1) if client.is_alive(): logger.warning( @@ -1334,6 +1381,16 @@ async def main() -> None: help="Expected request rate (Poisson process) per client in requests/sec." 
"Set to 0 for no delay between requests.", ) + parser.add_argument( + "--max-retries", + type=int, + default=int(os.environ.get("MULTITURN_BENCH_MAX_RETRIES", "0")), + help="Maximum number of retry attempts for timed-out requests. " + "Default is 0 (no retries). " + "Set to higher values to retry failed requests and maintain " + "fair workload distribution. " + "Can also be set via MULTITURN_BENCH_MAX_RETRIES environment variable.", + ) parser.add_argument( "--conversation-sampling", type=ConversationSampling, @@ -1351,6 +1408,13 @@ async def main() -> None: action="store_true", help="Verify the LLM output (compare to the answers in the input JSON file)", ) + parser.add_argument( + "--request-timeout-sec", + type=int, + default=120, + help="Timeout in seconds for each API request (default: 120). " + "Automatically increased if max tokens imply longer decoding.", + ) parser.add_argument( "--no-stream", diff --git a/benchmarks/multi_turn/requirements.txt b/benchmarks/multi_turn/requirements.txt index f0e1935914a14..bae656a5c5c4b 100644 --- a/benchmarks/multi_turn/requirements.txt +++ b/benchmarks/multi_turn/requirements.txt @@ -2,4 +2,5 @@ numpy>=1.24 pandas>=2.0.0 aiohttp>=3.10 transformers>=4.46 -xlsxwriter>=3.2.1 \ No newline at end of file +xlsxwriter>=3.2.1 +tqdm>=4.66 diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index dbda19fbcbf20..bb0179c79c108 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -15,6 +15,7 @@ endif() # set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16}) set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI}) +set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16}) include_directories("${CMAKE_SOURCE_DIR}/csrc") @@ -140,6 +141,22 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) set(ENABLE_AVX512VNNI OFF) message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.") endif() + + find_isa(${CPUINFO} "amx_bf16" AMXBF16_FOUND) + if (AMXBF16_FOUND OR ENABLE_AMXBF16) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) + list(APPEND CXX_COMPILE_FLAGS "-mamx-bf16" "-mamx-tile") + set(ENABLE_AMXBF16 ON) + add_compile_definitions(-DCPU_CAPABILITY_AMXBF16) + else() + set(ENABLE_AMXBF16 OFF) + message(WARNING "Disable AMX_BF16 ISA support, requires gcc/g++ >= 12.3") + endif() + else() + set(ENABLE_AMXBF16 OFF) + message(WARNING "Disable AMX_BF16 ISA support, no amx_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AMXBF16=1.") + endif() elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") @@ -193,7 +210,30 @@ endif() if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64 # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN + set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "") if(ASIMD_FOUND) + # Set number of parallel build processes + include(ProcessorCount) + ProcessorCount(NPROC) + if(NOT NPROC) + set(NPROC 4) + endif() + # locate PyTorch's libgomp (e.g. 
site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0) + # and create a local shim dir with it + vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR) + + find_library(OPEN_MP + NAMES gomp + PATHS ${VLLM_TORCH_GOMP_SHIM_DIR} + NO_DEFAULT_PATH + REQUIRED + ) + # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch + if (OPEN_MP) + set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}") + endif() + + # Fetch and populate ACL if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}") message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}") else() @@ -207,38 +247,53 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON GIT_PROGRESS TRUE ) set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}") + set(ACL_LIB_DIR "$ENV{ACL_ROOT_DIR}/build") endif() - # Build ACL with scons - include(ProcessorCount) - ProcessorCount(_NPROC) - set(_scons_cmd - scons -j${_NPROC} - Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux - arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1 - multi_isa=1 openmp=1 cppthreads=0 + # Build ACL with CMake + set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF") + set(CMAKE_BUILD_TYPE "Release") + set(ARM_COMPUTE_ARCH "armv8.2-a") + set(ARM_COMPUTE_ENABLE_ASSERTS "OFF") + set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF") + set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") + set(ARM_COMPUTE_ENABLE_OPENMP "ON") + set(ARM_COMPUTE_ENABLE_WERROR "OFF") + set(ARM_COMPUTE_BUILD_EXAMPLES "OFF") + set(ARM_COMPUTE_BUILD_TESTING "OFF") + + set(_cmake_config_cmd + ${CMAKE_COMMAND} -G Ninja -B build + -DARM_COMPUTE_BUILD_SHARED_LIB=OFF + -DCMAKE_BUILD_TYPE=Release + -DARM_COMPUTE_ARCH=armv8.2-a + -DARM_COMPUTE_ENABLE_ASSERTS=OFF + -DARM_COMPUTE_ENABLE_CPPTHREADS=OFF + -DARM_COMPUTE_ENABLE_OPENMP=ON + -DARM_COMPUTE_ENABLE_WERROR=OFF + -DARM_COMPUTE_BUILD_EXAMPLES=OFF + -DARM_COMPUTE_BUILD_TESTING=OFF) + set(_cmake_build_cmd + ${CMAKE_COMMAND} --build build -- -j${NPROC} ) - # locate PyTorch's libgomp (e.g. 
site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0) - # and create a local shim dir with it - include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake") - vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR) - - if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "") - list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR}) - endif() - execute_process( - COMMAND ${_scons_cmd} + COMMAND ${_cmake_config_cmd} + WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}" + ) + execute_process( + COMMAND ${_cmake_build_cmd} WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}" RESULT_VARIABLE _acl_rc ) + if(NOT _acl_rc EQUAL 0) message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).") endif() + message(STATUS "Arm Compute Library (ACL) built successfully.") - set(ONEDNN_AARCH64_USE_ACL "ON") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") + # VLLM/oneDNN settings for ACL + set(ONEDNN_AARCH64_USE_ACL ON CACHE BOOL "" FORCE) add_compile_definitions(VLLM_USE_ACL) endif() @@ -275,7 +330,10 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON set(ONEDNN_VERBOSE "OFF") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE}) + set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size FetchContent_MakeAvailable(oneDNN) + set(CMAKE_BUILD_TYPE ${VLLM_BUILD_TYPE}) add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp") target_include_directories( dnnl_ext @@ -305,14 +363,14 @@ endif() # set(VLLM_EXT_SRC "csrc/cpu/activation.cpp" - "csrc/cpu/attention.cpp" - "csrc/cpu/cache.cpp" "csrc/cpu/utils.cpp" "csrc/cpu/layernorm.cpp" "csrc/cpu/mla_decode.cpp" "csrc/cpu/pos_encoding.cpp" - "csrc/cpu/torch_bindings.cpp" - "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") + "csrc/moe/dynamic_4bit_int_moe_cpu.cpp" + "csrc/cpu/cpu_attn.cpp" + "csrc/cpu/scratchpad_manager.cpp" + "csrc/cpu/torch_bindings.cpp") if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp deleted file mode 100644 index 82862fea7f2be..0000000000000 --- a/csrc/cpu/attention.cpp +++ /dev/null @@ -1,798 +0,0 @@ -#include "cpu_types.hpp" - -namespace { - -template -struct KernelVecType { - using q_load_vec_type = void; - using q_vec_type = void; - using k_load_vec_type = void; - using k_vec_type = void; - using qk_acc_vec_type = void; - using v_load_vec_type = void; -}; - -template <> -struct KernelVecType { - using q_load_vec_type = vec_op::FP32Vec4; - using q_vec_type = vec_op::FP32Vec16; - using k_load_vec_type = vec_op::FP32Vec16; - using k_vec_type = vec_op::FP32Vec16; - using qk_acc_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::FP32Vec16; -}; - -template <> -struct KernelVecType { -#if defined(__powerpc64__) || defined(__s390x__) - // Power and s390x architecture-specific vector types - using q_load_vec_type = vec_op::FP32Vec8; - using k_load_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::FP32Vec16; -#else - // Fallback for other architectures, including x86 - using q_load_vec_type = vec_op::FP16Vec8; - using k_load_vec_type = vec_op::FP16Vec16; - using v_load_vec_type = vec_op::FP16Vec16; -#endif - using q_vec_type = vec_op::FP32Vec16; - using k_vec_type = vec_op::FP32Vec16; - using qk_acc_vec_type = vec_op::FP32Vec16; -}; - -#ifdef __AVX512BF16__ -template <> -struct KernelVecType { - using q_load_vec_type = vec_op::BF16Vec8; - using q_vec_type = vec_op::BF16Vec32; - using k_load_vec_type = vec_op::BF16Vec32; - using k_vec_type = vec_op::BF16Vec32; - using qk_acc_vec_type = 
vec_op::FP32Vec16; - using v_load_vec_type = vec_op::BF16Vec16; -}; -#else - #ifdef __aarch64__ - #ifndef ARM_BF16_SUPPORT - // pass - #else -template <> -struct KernelVecType { - using q_load_vec_type = vec_op::BF16Vec8; - using q_vec_type = vec_op::FP32Vec16; - using k_load_vec_type = vec_op::BF16Vec16; - using k_vec_type = vec_op::FP32Vec16; - using qk_acc_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::BF16Vec16; -}; - #endif - #else -template <> -struct KernelVecType { - using q_load_vec_type = vec_op::BF16Vec8; - using q_vec_type = vec_op::FP32Vec16; - using k_load_vec_type = vec_op::BF16Vec16; - using k_vec_type = vec_op::FP32Vec16; - using qk_acc_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::BF16Vec16; -}; - #endif -#endif - -template -FORCE_INLINE std::pair reduceSoftmax(T* data, const int size, - const int capacity) { - T max = data[0]; - for (int i = 1; i < size; ++i) { - max = max >= data[i] ? max : data[i]; - } - - T sum = 0; - for (int i = 0; i < size; ++i) { - data[i] = std::exp(data[i] - max); - sum += data[i]; - } - - int i = 0; - for (; i < size; ++i) { - data[i] /= sum; - } - - for (; i < capacity; ++i) { - data[i] = 0; - } - - return {max, sum}; -} - -template -FORCE_INLINE std::pair reduceSoftmaxAlibi(T* data, const int size, - const int capacity, - const float alibi_slope, - const int start_index, - const int seq_len) { - data[0] += alibi_slope * (start_index - seq_len + 1); - T max = data[0]; - for (int i = 1; i < size; ++i) { - T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1); - data[i] = qk; - max = max >= qk ? max : qk; - } - - T sum = 0; - for (int i = 0; i < size; ++i) { - data[i] = std::exp(data[i] - max); - sum += data[i]; - } - - int i = 0; - for (; i < size; ++i) { - data[i] /= sum; - } - - for (; i < capacity; ++i) { - data[i] = 0; - } - - return {max, sum}; -} - -template -FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data, - const int size) { - T max = max_data[0]; - for (int i = 1; i < size; ++i) { - max = max >= max_data[i] ? 
max : max_data[i]; - } - - T rescaled_sum = 0; - for (int i = 0; i < size; ++i) { - T rescale_factor = std::exp(max_data[i] - max); - rescaled_sum += rescale_factor * sum_data[i]; - sum_data[i] *= rescale_factor; - } - for (int i = 0; i < size; ++i) { - sum_data[i] /= rescaled_sum + 1e-8; - } -} - -template -struct reduceQKBlockKernel { - using q_load_vec_type = typename KernelVecType::q_load_vec_type; - using q_vec_type = typename KernelVecType::q_vec_type; - using k_load_vec_type = typename KernelVecType::k_load_vec_type; - using k_vec_type = typename KernelVecType::k_vec_type; - using qk_acc_vec_type = typename KernelVecType::qk_acc_vec_type; - - constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x; - constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP; - constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4; - - static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4); - static_assert(k_load_vec_type::get_elem_num() % x == 0); - static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16); - - FORCE_INLINE static void call(const scalar_t* __restrict__ q, - const scalar_t* __restrict__ k_block, - float* __restrict__ logits, float scale, - const int token_num) { - const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP; - - qk_acc_vec_type group_accums[MAX_GROUP_NUM]; - if (token_num == BLOCK_SIZE) { - for (int q_offset = 0; q_offset < HEAD_SIZE; - q_offset += x, k_block += x * BLOCK_SIZE) { - q_load_vec_type q_load_group_vec(q + q_offset); - q_vec_type q_group_vec(q_load_group_vec); - - vec_op::unroll_loop( - [k_block, &q_group_vec, &group_accums](int token_group_idx) { - k_load_vec_type k_load_group_vec(k_block + token_group_idx * x * - TOKEN_PER_GROUP); - k_vec_type k_group_vec(k_load_group_vec); - vec_op::fma(group_accums[token_group_idx], q_group_vec, - k_group_vec); - vec_op::prefetch(k_block + x * BLOCK_SIZE + - token_group_idx * x * TOKEN_PER_GROUP); - }); - } - } else { - for (int q_offset = 0; q_offset < HEAD_SIZE; - q_offset += x, k_block += x * BLOCK_SIZE) { - q_load_vec_type q_load_group_vec(q + q_offset); - q_vec_type q_group_vec(q_load_group_vec); - for (int token_group_start = 0; token_group_start < group_num; - token_group_start += UNROLL_GROUP_NUM) { - vec_op::unroll_loop( - [token_group_start, k_block, &q_group_vec, - &group_accums](int token_group_idx) { - token_group_idx += token_group_start; - k_load_vec_type k_load_group_vec(k_block + token_group_idx * x * - TOKEN_PER_GROUP); - k_vec_type k_group_vec(k_load_group_vec); - vec_op::fma(group_accums[token_group_idx], q_group_vec, - k_group_vec); - vec_op::prefetch(k_block + x * BLOCK_SIZE + - token_group_idx * x * TOKEN_PER_GROUP); - }); - } - } - } - - for (int token_group_idx = 0; token_group_idx < group_num; - ++token_group_idx) { - vec_op::unroll_loop( - [&group_accums, logits, scale, token_group_idx](int token_idx) { - float dot_v = - group_accums[token_group_idx] - .template reduce_sub_sum(token_idx); - logits[token_group_idx * TOKEN_PER_GROUP + token_idx] = - dot_v * scale; - }); - } - } -}; - -template -FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block, - acc_t&& acc) { - using v_load_vec_type = typename KernelVecType::v_load_vec_type; - constexpr int ELEM_NUM = v_load_vec_type::get_elem_num(); - static_assert(BLOCK_SIZE == ELEM_NUM); - vec_op::FP32Vec16 prob_vec(prob); - - vec_op::unroll_loop([&](int head_elem_idx) { - v_load_vec_type v_vec(v_block + BLOCK_SIZE * head_elem_idx); - vec_op::FP32Vec16 fp32_v_vec(v_vec); - 
acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec; - }); -} -}; // namespace - -// Paged attention v1 -namespace { -template -struct paged_attention_v1_impl { - static void call( - scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, - // head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, - // head_size, block_size] - const int num_kv_heads, const float scale, - const int* __restrict__ block_tables, // [num_seqs, - // max_num_blocks_per_seq] - const int* __restrict__ seq_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, const int kv_block_stride, const int kv_head_stride, - const int num_seqs, const int num_heads) { - constexpr int x = 16 / sizeof(scalar_t); - const int num_queries_per_kv = num_heads / num_kv_heads; - - static_assert(BLOCK_SIZE == 16); - - int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE; - int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0; - TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0); - - const int parallel_work_item_num = omp_get_max_threads(); - - size_t logits_bytes = - parallel_work_item_num * max_seq_len_padded * sizeof(float); - float* logits = (float*)std::aligned_alloc( - 64, logits_bytes); // Cacheline alignment for each context token. - // [parallel_work_item_num, max_seq_len_padded] - -#pragma omp parallel for collapse(2) schedule(dynamic, 1) - for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { - for (int head_idx = 0; head_idx < num_heads; ++head_idx) { - int seq_len = seq_lens[seq_idx]; - const int* seq_block_table = - block_tables + max_num_blocks_per_seq * seq_idx; - const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; - const int64_t kv_head_idx = head_idx / num_queries_per_kv; - const scalar_t* __restrict__ q_vec_ptr = - q + seq_idx * q_stride + head_idx * HEAD_SIZE; - const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE; - float* __restrict__ thread_block_logits = - logits + omp_get_thread_num() * max_seq_len_padded; - - // Compute logits - for (int block_idx = 0; block_idx < block_num; ++block_idx) { - const int64_t physical_block_idx = seq_block_table[block_idx]; - const scalar_t* __restrict__ k_block_cache_ptr = - k_cache + physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride; - float* __restrict__ head_block_logits = - thread_block_logits + block_idx * BLOCK_SIZE; - - reduceQKBlockKernel::call( - q_vec_ptr, k_block_cache_ptr, head_block_logits, scale, - block_idx == block_num - 1 ? 
last_block_token_num : BLOCK_SIZE); - } - - // Compute softmax - if (alibi_slopes) { - reduceSoftmaxAlibi(thread_block_logits, seq_len, - block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0, - seq_len); - } else { - reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE); - } - - // Compute value - constexpr int head_elem_num_per_partition = 16; - constexpr int head_partition_num = - HEAD_SIZE / head_elem_num_per_partition; - for (int head_part_idx = 0; head_part_idx < head_partition_num; - ++head_part_idx) { - vec_op::FP32Vec16 accums[head_elem_num_per_partition]; - scalar_t* __restrict__ out_ptr = - out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + - head_part_idx * head_elem_num_per_partition; - for (int block_idx = 0; block_idx < block_num; ++block_idx) { - const int64_t physical_block_idx = seq_block_table[block_idx]; - const float* __restrict__ prob_vec_ptr = - thread_block_logits + block_idx * BLOCK_SIZE; - const scalar_t* __restrict__ v_block_cache_ptr = - v_cache + physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride + - BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; - reduceValueBlock( - prob_vec_ptr, v_block_cache_ptr, accums); - - if (block_idx != block_num - 1) { - const int64_t next_physical_block_idx = - seq_block_table[block_idx + 1]; - const scalar_t* __restrict__ next_v_block_cache_ptr = - v_cache + next_physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride + - BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; - vec_op::unroll_loop( - [&](int head_elem_idx) { - if (head_elem_idx % 2 == 0) { - vec_op::prefetch(next_v_block_cache_ptr + - BLOCK_SIZE * head_elem_idx); - } - }); - } - } - - vec_op::unroll_loop( - [&](int head_elem_idx) { - float value = accums[head_elem_idx].reduce_sum(); - vec_op::storeFP32(value, out_ptr + head_elem_idx); - }); - } - } - } - std::free(logits); - } -}; - -#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ - paged_attention_v1_impl::call( \ - out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \ - num_heads); - -template -void paged_attention_v1_impl_launcher( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const std::optional& alibi_slopes) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? 
reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - switch (head_size) { - case 32: - LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); - break; - case 64: - LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); - break; - case 80: - LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); - break; - case 96: - LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); - break; - case 112: - LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); - break; - case 128: - LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); - break; - case 192: - LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); - break; - case 256: - LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ - paged_attention_v1_impl_launcher( \ - out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, max_seq_len, alibi_slopes); - -#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ - switch (block_size) { \ - case 16: \ - CALL_V1_KERNEL_LAUNCHER(T, 16); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } -} // namespace - -void paged_attention_v1( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int64_t num_kv_heads, double scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, torch::Tensor& k_scale, - torch::Tensor& v_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(blocksparse_vert_stride <= 1, - "CPU backend does not support blocksparse attention yet."); - VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", - [&] { - CPU_KERNEL_GUARD_IN(paged_attention_v1_impl) - CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); - CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl) - }); -} - -// Paged attention v2 -namespace { -template -struct paged_attention_v2_impl { - static void call( - scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] - float* __restrict__ exp_sums, // [num_seqs, num_heads, - // max_num_partitions] - float* __restrict__ max_logits, // [num_seqs, num_heads, - // max_num_partitions] - scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, - // max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, - // head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, - // head_size, block_size] - const int num_kv_heads, const float scale, - const int* __restrict__ block_tables, // [num_seqs, - // max_num_blocks_per_seq] - const int* __restrict__ seq_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, const int kv_block_stride, const int kv_head_stride, - const int num_seqs, const int num_heads, const int max_num_partitions) { - 
constexpr int x = 16 / sizeof(scalar_t); - const int num_queries_per_kv = num_heads / num_kv_heads; - - static_assert(BLOCK_SIZE == 16); - static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0); - static_assert(PARTITION_SIZE % BLOCK_SIZE == 0); - -#pragma omp parallel for collapse(3) schedule(static, 1) - for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { - for (int partition_idx = 0; partition_idx < max_num_partitions; - ++partition_idx) { - for (int head_idx = 0; head_idx < num_heads; ++head_idx) { - const int seq_len = seq_lens[seq_idx]; - const int start_token_idx = partition_idx * PARTITION_SIZE; - - if (start_token_idx >= seq_len) continue; - - const int partition_num = - (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; - const bool no_reduce = (partition_num == 1); - const int token_num = - (std::min(seq_len, start_token_idx + PARTITION_SIZE) - - start_token_idx); - const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE; - const int last_block_token_num = - token_num - (block_num - 1) * BLOCK_SIZE; - const int* seq_block_table = block_tables + - max_num_blocks_per_seq * seq_idx + - start_token_idx / BLOCK_SIZE; - const int64_t kv_head_idx = head_idx / num_queries_per_kv; - const scalar_t* __restrict__ q_vec_ptr = - q + seq_idx * q_stride + head_idx * HEAD_SIZE; - - float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0}; - - // Compute logits - for (int block_idx = 0; block_idx < block_num; ++block_idx) { - const int64_t physical_block_idx = seq_block_table[block_idx]; - const scalar_t* __restrict__ k_block_cache_ptr = - k_cache + physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride; - float* __restrict__ head_block_logits = - logits + block_idx * BLOCK_SIZE; - - reduceQKBlockKernel::call( - q_vec_ptr, k_block_cache_ptr, head_block_logits, scale, - block_idx == block_num - 1 ? 
last_block_token_num : BLOCK_SIZE); - } - - std::pair max_and_sum; - if (alibi_slopes) { - max_and_sum = reduceSoftmaxAlibi( - logits, token_num, block_num * BLOCK_SIZE, - alibi_slopes[head_idx], start_token_idx, seq_len); - } else { - max_and_sum = - reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE); - } - - auto&& [max_logit, exp_sum] = max_and_sum; - - scalar_t* __restrict__ output_buffer = nullptr; - if (!no_reduce) { - auto idx = seq_idx * num_heads * max_num_partitions + - head_idx * max_num_partitions + partition_idx; - max_logits[idx] = max_logit; - exp_sums[idx] = exp_sum; - output_buffer = - tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + - head_idx * max_num_partitions * HEAD_SIZE + - partition_idx * HEAD_SIZE; - } else { - output_buffer = - out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; - } - - // Compute value - constexpr int head_elem_num_per_partition = 16; - constexpr int head_partition_num = - HEAD_SIZE / head_elem_num_per_partition; - for (int head_part_idx = 0; head_part_idx < head_partition_num; - ++head_part_idx) { - vec_op::FP32Vec16 accums[head_elem_num_per_partition]; - scalar_t* __restrict__ out_ptr = - output_buffer + head_part_idx * head_elem_num_per_partition; - for (int block_idx = 0; block_idx < block_num; ++block_idx) { - const int64_t physical_block_idx = seq_block_table[block_idx]; - const float* __restrict__ prob_vec_ptr = - logits + block_idx * BLOCK_SIZE; - const scalar_t* __restrict__ v_block_cache_ptr = - v_cache + physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride + - BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; - reduceValueBlock( - prob_vec_ptr, v_block_cache_ptr, accums); - - if (block_idx != block_num - 1) { - const int64_t next_physical_block_idx = - seq_block_table[block_idx + 1]; - const scalar_t* __restrict__ next_v_block_cache_ptr = - v_cache + next_physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride + - BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; - vec_op::unroll_loop( - [&](int head_elem_idx) { - if (head_elem_idx % 2 == 0) { - vec_op::prefetch(next_v_block_cache_ptr + - BLOCK_SIZE * head_elem_idx); - } - }); - } - } - - vec_op::unroll_loop( - [&](int head_elem_idx) { - float value = accums[head_elem_idx].reduce_sum(); - vec_op::storeFP32(value, out_ptr + head_elem_idx); - }); - } - } - } - } - - // Rescale partition softmax and store the factors to exp_sums -#pragma omp parallel for collapse(2) schedule(static, 1) - for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { - for (int head_idx = 0; head_idx < num_heads; ++head_idx) { - const int seq_len = seq_lens[seq_idx]; - const int partition_num = - (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; - - if (partition_num == 1) continue; - - reducePartitionSoftmax( - max_logits + seq_idx * num_heads * max_num_partitions + - head_idx * max_num_partitions, - exp_sums + seq_idx * num_heads * max_num_partitions + - head_idx * max_num_partitions, - partition_num); - } - } - - // Reduce values - using v_load_vec_type = typename KernelVecType::v_load_vec_type; - static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE); - constexpr int head_elem_num_per_group = - 16; // Note: didn't align with the cacheline size, due to some - // HEAD_SIZE didn't align with 64 bytes - static_assert(HEAD_SIZE % head_elem_num_per_group == 0); - constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group; - const float* __restrict__ rescale_factors = exp_sums; -#pragma omp parallel for collapse(3) 
schedule(static, 1) - for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { - for (int head_idx = 0; head_idx < num_heads; ++head_idx) { - for (int group_idx = 0; group_idx < head_group_num; ++group_idx) { - const int seq_len = seq_lens[seq_idx]; - const int partition_num = - (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; - - if (partition_num == 1) continue; - - const float* __restrict__ seq_head_rescale_factors = - rescale_factors + seq_idx * num_heads * max_num_partitions + - head_idx * max_num_partitions; - const scalar_t* __restrict__ seq_head_tmp_out = - tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + - head_idx * max_num_partitions * HEAD_SIZE + - group_idx * head_elem_num_per_group; - scalar_t* __restrict__ seq_head_output = - out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + - group_idx * head_elem_num_per_group; - - vec_op::FP32Vec16 acc; - for (int i = 0; i < partition_num; ++i) { - vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]); - v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE); - vec_op::FP32Vec16 fp32_value(value); - acc = acc + fp32_value * rescale_factor; - } - v_load_vec_type cast_acc(acc); - cast_acc.save(seq_head_output); - } - } - } - } -}; - -#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ - paged_attention_v2_impl::call( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \ - key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ - seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, num_seqs, num_heads, \ - max_num_partitions); - -template -void paged_attention_v2_impl_launcher( - torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, - int max_seq_len, const std::optional& alibi_slopes) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - int max_num_partitions = exp_sums.size(-1); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? 
reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); - float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); - T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - switch (head_size) { - case 32: - LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); - break; - case 64: - LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); - break; - case 80: - LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); - break; - case 96: - LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); - break; - case 112: - LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); - break; - case 128: - LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); - break; - case 192: - LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); - break; - case 256: - LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ - paged_attention_v2_impl_launcher( \ - out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \ - alibi_slopes); - -#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ - switch (block_size) { \ - case 16: \ - CALL_V2_KERNEL_LAUNCHER(T, 16); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } -} // namespace - -void paged_attention_v2( - torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int64_t num_kv_heads, double scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, torch::Tensor& k_scale, - torch::Tensor& v_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(blocksparse_vert_stride <= 1, - "CPU backend does not support blocksparse attention yet."); - VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", - [&] { - CPU_KERNEL_GUARD_IN(paged_attention_v2_impl) - CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); - CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl) - }); -} \ No newline at end of file diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp deleted file mode 100644 index 69f6d06e3c967..0000000000000 --- a/csrc/cpu/cache.cpp +++ /dev/null @@ -1,214 +0,0 @@ -#include -#include - -#include "cpu_types.hpp" - -#if defined(__x86_64__) - #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2 -#else - #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES -#endif - -namespace { -template -void copy_blocks_cpu_impl(std::vector const& key_caches, - std::vector const& value_caches, - const torch::Tensor& mapping_pairs, - const int element_num_per_block, - const int layer_num) { - const size_t pair_num = mapping_pairs.size(0); - const size_t block_bytes = sizeof(scalar_t) * element_num_per_block; -#pragma omp parallel for collapse(2) - for (int layer = 0; layer < 
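// Illustrative note (not part of this patch): the head_size switch and the
// CALL_V2_KERNEL_LAUNCHER* macros above only exist to promote the runtime
// head_size / block_size to template parameters; e.g. fp32 queries with
// block_size == 16 end up instantiating roughly
//   paged_attention_v2_impl_launcher<float, /*BLOCK_SIZE=*/16>(out, exp_sums, ...);
// so the vectorized kernel can size its per-head loops at compile time.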
layer_num; ++layer) { - for (size_t pair = 0; pair < pair_num; ++pair) { - int64_t source_offset = - element_num_per_block * mapping_pairs[pair][0].item(); - int64_t target_offset = - element_num_per_block * mapping_pairs[pair][1].item(); - scalar_t* key_cache_ptr = key_caches[layer].data_ptr(); - scalar_t* source_ptr = key_cache_ptr + source_offset; - scalar_t* target_ptr = key_cache_ptr + target_offset; - std::memcpy(target_ptr, source_ptr, block_bytes); - - scalar_t* value_cache_ptr = value_caches[layer].data_ptr(); - source_ptr = value_cache_ptr + source_offset; - target_ptr = value_cache_ptr + target_offset; - std::memcpy(target_ptr, source_ptr, block_bytes); - } - } -} - -template -void reshape_and_cache_cpu_impl( - const scalar_t* __restrict__ key, const scalar_t* __restrict__ value, - scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache, - const int64_t* __restrict__ slot_mapping, const int num_tokens, - const int key_stride, const int value_stride, const int num_heads, - const int head_size, const int block_size, const int x) { - const int block_elem_num = num_heads * head_size * block_size; - -#pragma omp parallel for collapse(2) - for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { - for (int head_idx = 0; head_idx < num_heads; ++head_idx) { - const int64_t slot_idx = slot_mapping[token_idx]; - if (slot_idx >= 0) { - int src_key_head_idx = token_idx * key_stride + head_idx * head_size; - int src_value_head_idx = - token_idx * value_stride + head_idx * head_size; - const scalar_t* src_key_head_ptr = key + src_key_head_idx; - const scalar_t* src_value_head_ptr = value + src_value_head_idx; - const int64_t block_index = slot_idx / block_size; - const int64_t block_offset = slot_idx % block_size; - scalar_t* target_key_head_ptr = key_cache + - block_elem_num * block_index + - head_idx * block_size * head_size; - scalar_t* target_value_head_ptr = value_cache + - block_elem_num * block_index + - head_idx * block_size * head_size; - - for (int src_key_idx = 0; src_key_idx < head_size; src_key_idx += x) { - const int64_t target_offset = - src_key_idx * block_size + block_offset * x; - for (int i = 0; i < x; ++i) { - target_key_head_ptr[target_offset + i] = - src_key_head_ptr[src_key_idx + i]; - } - } - - for (int src_value_idx = 0; src_value_idx < head_size; - ++src_value_idx) { - const int64_t target_offset = - src_value_idx * block_size + block_offset; - target_value_head_ptr[target_offset] = - src_value_head_ptr[src_value_idx]; - } - } - } - } -} -}; // namespace - -template -void concat_and_cache_mla_cpu_impl( - const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank] - const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim] - scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank - // + pe_dim)] - const int64_t* __restrict__ slot_mapping, // [num_tokens] - const int num_tokens, // - const int block_stride, // - const int entry_stride, // - const int kv_c_stride, // - const int k_pe_stride, // - const int kv_lora_rank, // - const int pe_dim, // - const int block_size // -) { -#pragma omp parallel for - for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { - const int64_t slot_idx = slot_mapping[token_idx]; - // NOTE: slot_idx can be -1 if the token is padded - if (slot_idx < 0) { - continue; - } - const int64_t block_idx = slot_idx / block_size; - const int64_t block_offset = slot_idx % block_size; - - auto copy = [&](const scalar_t* __restrict__ src, - scalar_t* __restrict__ dst, int src_stride, int dst_stride, - 
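// Illustrative sketch (not part of this patch): the legacy CPU key cache being
// removed here is laid out as
//   [num_blocks, num_heads, head_size / x, block_size, x],
// so key element d of head h for the token at (block_index, block_offset) is
// stored at
//   key_cache[block_elem_num * block_index
//             + h * block_size * head_size
//             + (d / x) * block_size * x
//             + block_offset * x
//             + (d % x)],
// which is the src_key_idx / target_offset arithmetic in
// reshape_and_cache_cpu_impl above; the value cache uses the plain transpose
//   ... + d * block_size + block_offset.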
int size, int offset) { - for (int i = 0; i < size; i++) { - const int64_t src_idx = token_idx * src_stride + i; - const int64_t dst_idx = - block_idx * block_stride + block_offset * entry_stride + i + offset; - dst[dst_idx] = src[src_idx]; - } - }; - - copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0); - copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); - } -} - -// Note: the key_caches and value_caches vectors are constant but -// not the Tensors they contain. The vectors need to be const refs -// in order to satisfy pytorch's C++ operator registration code. -void copy_blocks(std::vector const& key_caches, - std::vector const& value_caches, - const torch::Tensor& block_mapping) { - unsigned num_layers = key_caches.size(); - TORCH_CHECK(num_layers == value_caches.size()); - if (num_layers == 0) { - return; - } - - const int element_num_per_block = key_caches[0][0].numel(); - DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] { - CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl) - copy_blocks_cpu_impl(key_caches, value_caches, block_mapping, - element_num_per_block, num_layers); - CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl) - }); -} - -void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, - torch::Tensor& key_cache, torch::Tensor& value_cache, - torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, - torch::Tensor& k_scale, torch::Tensor& v_scale) { - int num_tokens = key.size(0); - int num_heads = key.size(1); - int head_size = key.size(2); - int block_size = key_cache.size(3); - int x = key_cache.size(4); - - int key_stride = key.stride(0); - int value_stride = value.stride(0); - - DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] { - CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl) - reshape_and_cache_cpu_impl( - key.data_ptr(), value.data_ptr(), - key_cache.data_ptr(), value_cache.data_ptr(), - slot_mapping.data_ptr(), num_tokens, key_stride, value_stride, - num_heads, head_size, block_size, x); - CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl) - }); -} - -void concat_and_cache_mla( - torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] - torch::Tensor& k_pe, // [num_tokens, pe_dim] - torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank + - // pe_dim)] - torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] - const std::string& kv_cache_dtype, torch::Tensor& scale) { - int num_tokens = slot_mapping.size(0); - int kv_lora_rank = kv_c.size(1); - int pe_dim = k_pe.size(1); - int block_size = kv_cache.size(1); - - TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); - TORCH_CHECK(kv_cache_dtype != "fp8"); - - int kv_c_stride = kv_c.stride(0); - int k_pe_stride = k_pe.stride(0); - int block_stride = kv_cache.stride(0); - int entry_stride = kv_cache.stride(1); - - VLLM_DISPATCH_FLOATING_TYPES( - kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] { - CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl) - concat_and_cache_mla_cpu_impl( - kv_c.data_ptr(), k_pe.data_ptr(), - kv_cache.data_ptr(), slot_mapping.data_ptr(), - num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride, - kv_lora_rank, pe_dim, block_size); - CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl) - }); -} - -void swap_blocks(torch::Tensor& src, torch::Tensor& dst, - const torch::Tensor& block_mapping) { - TORCH_CHECK(false, "swap_blocks is unsupported on CPU.") -} diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp new file mode 100644 index 0000000000000..50f17c758c148 --- /dev/null +++ 
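// Illustrative note (not part of this patch): in the removed MLA path above,
// each cache entry is the concatenation [ kv_c (kv_lora_rank) | k_pe (pe_dim) ]:
//   dst_idx = block_idx * block_stride + block_offset * entry_stride + offset + i,
// with offset = 0 for the kv_c copy and offset = kv_lora_rank for the k_pe
// copy, consistent with the kv_cache.size(2) == kv_lora_rank + pe_dim check.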
b/csrc/cpu/cpu_attn.cpp @@ -0,0 +1,249 @@ +#include "cpu_attn_vec.hpp" +#include "cpu_attn_vec16.hpp" + +#ifdef CPU_CAPABILITY_AMXBF16 + #include "cpu_attn_amx.hpp" + #define AMX_DISPATCH(...) \ + case cpu_attention::ISA::AMX: { \ + using attn_impl = cpu_attention::AttentionImpl; \ + return __VA_ARGS__(); \ + } +#else + #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX: +#endif + +#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \ + case HEAD_DIM: { \ + constexpr size_t head_dim = HEAD_DIM; \ + return __VA_ARGS__(); \ + } + +#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...) \ + [&] { \ + switch (HEAD_DIM) { \ + CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__) \ + default: { \ + TORCH_CHECK(false, "Invalid CPU attention head_dim: " + \ + std::to_string(HEAD_DIM)); \ + } \ + } \ + }() + +#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...) \ + [&] { \ + switch (ISA_TYPE) { \ + AMX_DISPATCH(__VA_ARGS__) \ + case cpu_attention::ISA::VEC: { \ + using attn_impl = \ + cpu_attention::AttentionImpl; \ + return __VA_ARGS__(); \ + } \ + case cpu_attention::ISA::VEC16: { \ + using attn_impl = \ + cpu_attention::AttentionImpl; \ + return __VA_ARGS__(); \ + } \ + default: { \ + TORCH_CHECK(false, "Invalid CPU attention ISA type."); \ + } \ + } \ + }() + +torch::Tensor get_scheduler_metadata( + const int64_t num_req, const int64_t num_heads_q, + const int64_t num_heads_kv, const int64_t head_dim, + const torch::Tensor& seq_lens, at::ScalarType dtype, + const torch::Tensor& query_start_loc, const bool casual, + const int64_t window_size, const std::string& isa_hint, + const bool enable_kv_split) { + cpu_attention::ISA isa; + if (isa_hint == "amx") { + isa = cpu_attention::ISA::AMX; + } else if (isa_hint == "vec") { + isa = cpu_attention::ISA::VEC; + } else if (isa_hint == "vec16") { + isa = cpu_attention::ISA::VEC16; + } else { + TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint); + } + + cpu_attention::AttentionScheduler::ScheduleInput input; + input.num_reqs = num_req; + input.num_heads_q = num_heads_q; + input.num_heads_kv = num_heads_kv; + input.head_dim = head_dim; + input.query_start_loc = query_start_loc.data_ptr(); + input.seq_lens = seq_lens.data_ptr(); + if (window_size != -1) { + input.left_sliding_window_size = window_size - 1; + if (casual) { + input.right_sliding_window_size = 0; + } else { + input.right_sliding_window_size = window_size - 1; + } + } else { + input.left_sliding_window_size = -1; + if (casual) { + input.right_sliding_window_size = 0; + } else { + input.right_sliding_window_size = -1; + } + } + input.casual = casual; + input.isa = isa; + input.enable_kv_split = enable_kv_split; + TORCH_CHECK(casual, "Only supports casual mask for now."); + + VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() { + CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] { + CPU_ATTN_DISPATCH_IMPL(isa, [&]() { + input.elem_size = sizeof(scalar_t); + input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t); + input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t); + input.output_buffer_elem_size = + sizeof(attn_impl::partial_output_buffer_t); + input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration; + input.kv_block_alignment = attn_impl::BlockSizeAlignment; 
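// Illustrative sketch (not part of this patch): the three dispatch helpers
// above are designed to nest, so the innermost lambda sees both the element
// type and a concrete AttentionImpl specialization:
//
//   VLLM_DISPATCH_FLOATING_TYPES(dtype, "example", [&]() {
//     CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
//       CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
//         // scalar_t, constexpr head_dim and attn_impl are all usable here
//       });
//     });
//   });
//
// Note that without CPU_CAPABILITY_AMXBF16 the AMX_DISPATCH macro expands to a
// bare `case cpu_attention::ISA::AMX:` label, so an "amx" hint silently falls
// through to the VEC implementation on builds without AMX support.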
+ }); + }); + }); + + cpu_attention::AttentionScheduler scheduler; + torch::Tensor metadata = scheduler.schedule(input); + return metadata; +} + +void cpu_attn_reshape_and_cache( + const torch::Tensor& key, // [token_num, head_num, head_size] + const torch::Tensor& value, // [token_num, head_num, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_kv_heads, block_size, head_size] + torch::Tensor& + value_cache, // [num_blocks, num_kv_heads, block_size, head_size] + const torch::Tensor& slot_mapping, const std::string& isa) { + TORCH_CHECK_EQ(key.dim(), 3); + TORCH_CHECK_EQ(value.dim(), 3); + TORCH_CHECK_EQ(key_cache.dim(), 4); + TORCH_CHECK_EQ(value_cache.dim(), 4); + TORCH_CHECK_EQ(key.stride(2), 1); + TORCH_CHECK_EQ(value.stride(2), 1); + + const int64_t token_num = key.size(0); + const int64_t key_token_num_stride = key.stride(0); + const int64_t value_token_num_stride = value.stride(0); + const int64_t head_num = value.size(1); + const int64_t key_head_num_stride = key.stride(1); + const int64_t value_head_num_stride = value.stride(1); + const int64_t num_blocks = key_cache.size(0); + const int64_t num_blocks_stride = key_cache.stride(0); + const int64_t cache_head_num_stride = key_cache.stride(1); + const int64_t block_size = key_cache.size(2); + const int64_t block_size_stride = key_cache.stride(2); + const int64_t head_dim = key.size(-1); + + cpu_attention::ISA isa_tag = [&]() { + if (isa == "amx") { + return cpu_attention::ISA::AMX; + } else if (isa == "vec") { + return cpu_attention::ISA::VEC; + } else if (isa == "vec16") { + return cpu_attention::ISA::VEC16; + } else { + TORCH_CHECK(false, "Invalid ISA type: " + isa); + } + }(); + + VLLM_DISPATCH_FLOATING_TYPES( + key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() { + CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] { + CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() { + attn_impl::reshape_and_cache( + key.data_ptr(), value.data_ptr(), + key_cache.data_ptr(), + value_cache.data_ptr(), + slot_mapping.data_ptr(), token_num, + key_token_num_stride, value_token_num_stride, head_num, + key_head_num_stride, value_head_num_stride, num_blocks, + num_blocks_stride, cache_head_num_stride, block_size, + block_size_stride); + }); + }); + }); +} + +void cpu_attention_with_kv_cache( + const torch::Tensor& query, // [num_tokens, num_heads, head_size] + const torch::Tensor& + key_cache, // [num_blocks, num_kv_heads, block_size, head_size] + const torch::Tensor& + value_cache, // [num_blocks, num_kv_heads, block_size, head_size] + torch::Tensor& output, // [num_tokens, num_heads, head_size] + const torch::Tensor& query_start_loc, // [num_tokens + 1] + const torch::Tensor& seq_lens, // [num_tokens] + const double scale, const bool causal, + const std::optional& alibi_slopes, // [num_heads] + const int64_t sliding_window_left, const int64_t sliding_window_right, + const torch::Tensor& block_table, // [num_tokens, max_block_num] + const double softcap, const torch::Tensor& scheduler_metadata, + const std::optional& s_aux // [num_heads] +) { + TORCH_CHECK_EQ(query.dim(), 3); + TORCH_CHECK_EQ(query.stride(2), 1); + TORCH_CHECK_EQ(key_cache.dim(), 4); + TORCH_CHECK_EQ(value_cache.dim(), 4); + + cpu_attention::AttentionInput input; + input.metadata = reinterpret_cast( + scheduler_metadata.data_ptr()); + input.num_tokens = query.size(0); + input.num_heads = query.size(1); + input.num_kv_heads = key_cache.size(1); + input.block_size = key_cache.size(2); + input.query = query.data_ptr(); + input.query_num_tokens_stride = query.stride(0); + 
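// Illustrative note (not part of this patch): unlike the removed
// csrc/cpu/cache.cpp layout, both caches here are plain
//   [num_blocks, num_kv_heads, block_size, head_size]
// tensors (see the shape comments above) and slot_mapping carries one flat
// slot per token, which the ISA-specific attn_impl::reshape_and_cache
// presumably decomposes as
//   block_idx    = slot / block_size;
//   block_offset = slot % block_size;
// before applying any prepacking (e.g. the AMX tile layout in
// cpu_attn_amx.hpp).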
input.query_num_heads_stride = query.stride(1); + input.cache_num_blocks_stride = key_cache.stride(0); + input.cache_num_kv_heads_stride = key_cache.stride(1); + input.blt_num_tokens_stride = block_table.stride(0); + input.key_cache = key_cache.data_ptr(); + input.value_cache = value_cache.data_ptr(); + input.output = output.data_ptr(); + input.query_start_loc = query_start_loc.data_ptr(); + input.seq_lens = seq_lens.data_ptr(); + input.block_table = block_table.data_ptr(); + input.alibi_slopes = + alibi_slopes.has_value() ? alibi_slopes->data_ptr() : nullptr; + // For now sink must be bf16 + input.s_aux = s_aux.has_value() ? s_aux->data_ptr() : nullptr; + input.scale = scale; + input.causal = causal; + input.sliding_window_left = sliding_window_left; + input.sliding_window_right = sliding_window_right; + if (input.causal) { + // to make boundary calculation easier + input.sliding_window_right = 0; + } + float softcap_fp32 = softcap; + input.softcap = softcap_fp32; + + VLLM_DISPATCH_FLOATING_TYPES( + query.scalar_type(), "cpu_attention_with_kv_cache", [&]() { + CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] { + CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() { + TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0); + cpu_attention::AttentionMainLoop mainloop; + mainloop(&input); + }); + }); + }); +} diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp new file mode 100644 index 0000000000000..8da458b99119c --- /dev/null +++ b/csrc/cpu/cpu_attn_amx.hpp @@ -0,0 +1,511 @@ +#ifndef CPU_ATTN_AMX_HPP +#define CPU_ATTN_AMX_HPP + +#include "cpu_attn_impl.hpp" + +namespace cpu_attention { +namespace { +// AMX specific +constexpr static int64_t AMX_TILE_ROW_BYTES = 64; +constexpr static int64_t AMX_TILE_ROW_NUM = 16; +constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM; + +typedef struct __tile_config { + uint8_t palette_id = 1; + uint8_t start_row = 0; + uint8_t reserved_0[14] = {0}; + uint16_t colsb[16] = {0}; + uint8_t rows[16] = {0}; +} __tilecfg; + +// 2-2-4 pattern, for 16 < m <= 32 +// TILE 0, 1: load A matrix, row num should be 16, m - 16 +// TILE 2, 3: load B matrix, row num should be 16 +// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m +// - 16 +template +class TileGemm224 { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile, + void* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + TORCH_CHECK(false, "Unsupported kv cache type for TileGemm224"); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + TORCH_CHECK(false, "Unsupported kv cache type for TileGemm224"); + } +}; + +template <> +class TileGemm224 { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, + c10::BFloat16* __restrict__ a_tile, + c10::BFloat16* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + const int32_t k_times = + dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + c10::BFloat16* __restrict__ a_tile_0 = a_tile; + c10::BFloat16* __restrict__ a_tile_1 = a_tile + lda * AMX_TILE_ROW_NUM; + const int64_t a_tile_stride = [&]() { + if constexpr (phase == AttentionGemmPhase::QK) { + // q_buffer is prepacked + return 
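// Illustrative note (not part of this patch): __tilecfg mirrors the 64-byte
// configuration operand consumed by _tile_loadconfig (LDTILECFG): palette 1
// provides 8 tile registers of up to 16 rows x 64 bytes, colsb[i] is tile i's
// row width in bytes and rows[i] its row count. The init_tile_config()
// helpers below only adjust rows[] per call, while the AttentionImpl
// constructor fixes colsb[i] = 64, i.e. every tile always uses full-width
// rows.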
AMX_TILE_ROW_BYTES; + } else if constexpr (phase == AttentionGemmPhase::PV) { + // logits_buffer is row-major + return lda * sizeof(c10::BFloat16); + } else { + TORCH_CHECK(false, "Unreachable"); + } + }(); + + c10::BFloat16* __restrict__ b_tile_2 = b_tile; + c10::BFloat16* __restrict__ b_tile_3 = [&]() { + if constexpr (phase == AttentionGemmPhase::QK) { + // k_cache is prepacked + return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4); + } else if constexpr (phase == AttentionGemmPhase::PV) { + // v_cache is prepacked + return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4); + } else { + TORCH_CHECK(false, "Unreachable"); + } + }(); + // k_cache, v_cache are prepacked + const int32_t b_tile_stride = AMX_TILE_ROW_BYTES; + + // logits_buffer, output_buffer are not prepacked + float* __restrict__ c_tile_4 = c_tile; + float* __restrict__ c_tile_5 = + c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float); + float* __restrict__ c_tile_6 = c_tile + AMX_TILE_ROW_NUM * ldc; + float* __restrict__ c_tile_7 = + c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float); + const int32_t c_tile_stride = ldc * sizeof(float); + + if (accum_c) { + _tile_loadd(4, c_tile_4, c_tile_stride); + _tile_loadd(5, c_tile_5, c_tile_stride); + _tile_loadd(6, c_tile_6, c_tile_stride); + _tile_loadd(7, c_tile_7, c_tile_stride); + } else { + _tile_zero(4); + _tile_zero(5); + _tile_zero(6); + _tile_zero(7); + } + + for (int32_t k = 0; k < k_times; ++k) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_tile_stride); + _tile_dpbf16ps(4, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_tile_stride); + _tile_dpbf16ps(5, 0, 3); + _tile_loadd(1, a_tile_1, a_tile_stride); + _tile_dpbf16ps(6, 1, 2); + _tile_dpbf16ps(7, 1, 3); + + // update ptrs + if constexpr (phase == AttentionGemmPhase::QK) { + // Q buffer is prepacked + a_tile_0 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + a_tile_1 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + } else if constexpr (phase == AttentionGemmPhase::PV) { + // P buffer is not prepacked + a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + } else { + TORCH_CHECK(false, "Unreachable"); + } + b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + } + + _tile_stored(4, c_tile_4, c_tile_stride); + _tile_stored(5, c_tile_5, c_tile_stride); + _tile_stored(6, c_tile_6, c_tile_stride); + _tile_stored(7, c_tile_7, c_tile_stride); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + const int32_t m_0 = AMX_TILE_ROW_NUM; + const int32_t m_1 = m - AMX_TILE_ROW_NUM; + config.rows[0] = m_0; + config.rows[1] = m_1; + config.rows[2] = AMX_TILE_ROW_NUM; + config.rows[3] = AMX_TILE_ROW_NUM; + config.rows[4] = m_0; + config.rows[5] = m_0; + config.rows[6] = m_1; + config.rows[7] = m_1; + _tile_loadconfig(&config); + } +}; + +// 1-2-2 pattern, for 0 < m <= 16 +// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be +// m, m +// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row +// num should be 16 +// TILE 6, 7, (6, 7): store results C matrix, row num should be +// m +template +class TileGemm122 { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile, + void* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + TORCH_CHECK(false, "Unsupported kv 
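// Illustrative note (not part of this patch): each iteration of the
// TileGemm224 K-loop above consumes 32 bf16 elements of K (one 64-byte tile
// row per A row) and updates an (up to) 32 x 32 fp32 output block held in
// four tiles:
//   C4 += A0 * B2;   C5 += A0 * B3;
//   C6 += A1 * B2;   C7 += A1 * B3;
// where _tile_dpbf16ps dots adjacent bf16 pairs and accumulates in fp32, A is
// split at row 16 into tiles 0/1, and B contributes two 16-column halves via
// tiles 2/3.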
cache type for TileGemm122"); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + TORCH_CHECK(false, "Unsupported kv cache type for TileGemm122"); + } +}; + +template <> +class TileGemm122 { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, + c10::BFloat16* __restrict__ a_tile, + c10::BFloat16* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + c10::BFloat16* __restrict__ a_tile_0 = a_tile; + c10::BFloat16* __restrict__ a_tile_1 = [&]() { + if constexpr (phase == AttentionGemmPhase::QK) { + // q_buffer is prepacked + return a_tile + AMX_TILE_BYTES / sizeof(c10::BFloat16); + } else if constexpr (phase == AttentionGemmPhase::PV) { + // logits_buffer is row-major + return a_tile + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + } else { + TORCH_CHECK(false, "Unreachable"); + } + }(); + const int64_t a_tile_stride = [&]() { + if constexpr (phase == AttentionGemmPhase::QK) { + // q_buffer is prepacked + return AMX_TILE_ROW_BYTES; + } else if constexpr (phase == AttentionGemmPhase::PV) { + // logits_buffer is row-major + return lda * sizeof(c10::BFloat16); + } else { + TORCH_CHECK(false, "Unreachable"); + } + }(); + + c10::BFloat16* __restrict__ b_tile_2 = b_tile; + c10::BFloat16* __restrict__ b_tile_3 = [&]() { + if constexpr (phase == AttentionGemmPhase::QK) { + // k_cache is prepacked + return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4); + } else if constexpr (phase == AttentionGemmPhase::PV) { + // v_cache is prepacked + return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4); + } else { + TORCH_CHECK(false, "Unreachable"); + } + }(); + c10::BFloat16* __restrict__ b_tile_4 = + b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16); + c10::BFloat16* __restrict__ b_tile_5 = + b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16); + int64_t b_stride = AMX_TILE_ROW_BYTES; + + float* __restrict__ c_tile_6 = c_tile; + float* __restrict__ c_tile_7 = c_tile + AMX_TILE_ROW_BYTES / sizeof(float); + int64_t c_stride = ldc * sizeof(float); + + const int32_t k_times = + dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + const int32_t k_group_times = k_times / 2; + const bool has_tail = (k_times % 2 == 1); + + if (accum_c) { + _tile_loadd(6, c_tile_6, c_stride); + _tile_loadd(7, c_tile_7, c_stride); + } else { + _tile_zero(6); + _tile_zero(7); + } + + for (int32_t k = 0; k < k_group_times; ++k) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_dpbf16ps(6, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_dpbf16ps(7, 0, 3); + _tile_loadd(1, a_tile_1, a_tile_stride); + _tile_stream_loadd(4, b_tile_4, b_stride); + _tile_dpbf16ps(6, 1, 4); + _tile_stream_loadd(5, b_tile_5, b_stride); + _tile_dpbf16ps(7, 1, 5); + + // update ptrs + if constexpr (phase == AttentionGemmPhase::QK) { + // Q buffer is prepacked + a_tile_0 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + a_tile_1 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + } else if constexpr (phase == AttentionGemmPhase::PV) { + // P buffer is not prepacked + a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + } + b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_3 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_5 += 2 * AMX_TILE_BYTES / 
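// Illustrative note (not part of this patch): TileGemm122 covers m <= 16 query
// rows, so only output tiles 6/7 (32 fp32 columns) are live and the spare tile
// registers are used for software pipelining: each iteration of the main loop
// above streams two consecutive 32-element K steps (A via tiles 0/1, B via
// tiles 2/3 and 4/5) and issues four _tile_dpbf16ps accumulations, while the
// has_tail branch handles an odd trailing K step with tiles 0/2/3 only.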
sizeof(c10::BFloat16); + } + + if (has_tail) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_dpbf16ps(6, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_dpbf16ps(7, 0, 3); + } + + _tile_stored(6, c_tile_6, c_stride); + _tile_stored(7, c_tile_7, c_stride); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + config.rows[0] = m; + config.rows[1] = m; + config.rows[2] = AMX_TILE_ROW_NUM; + config.rows[3] = AMX_TILE_ROW_NUM; + config.rows[4] = AMX_TILE_ROW_NUM; + config.rows[5] = AMX_TILE_ROW_NUM; + config.rows[6] = m; + config.rows[7] = m; + _tile_loadconfig(&config); + } +}; +} // namespace + +template +class AttentionImpl { + public: + using query_t = scalar_t; + using q_buffer_t = scalar_t; + using kv_cache_t = scalar_t; + using logits_buffer_t = float; + using partial_output_buffer_t = float; + using prob_buffer_t = scalar_t; + + constexpr static int64_t BlockSizeAlignment = + AMX_TILE_ROW_BYTES / + sizeof(kv_cache_t); // KV token num unit of QK and PV phases + constexpr static int64_t HeadDimAlignment = + 2 * (AMX_TILE_ROW_BYTES / 4); // headdim num unit of PV phase + constexpr static int64_t MaxQHeadNumPerIteration = 32; + constexpr static int64_t HeadDim = head_dim; + constexpr static ISA ISAType = ISA::AMX; + constexpr static bool scale_on_logits = true; + + public: + AttentionImpl() : current_q_head_num_(0) { + // Use all columns in AMX tiles + vec_op::unroll_loop([&](int i) { amx_tile_config_.colsb[i] = 64; }); + } + + ~AttentionImpl() { _tile_release(); } + + template
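// Illustrative note (not part of this patch): for bf16 the constants above
// evaluate to BlockSizeAlignment = 64 / 2 = 32 KV tokens (the K-step unit of
// both GEMM phases), HeadDimAlignment = 2 * (64 / 4) = 32 fp32 output columns
// (two adjacent C tiles in the PV phase), and MaxQHeadNumPerIteration = 32,
// matching the 2-2-4 tiling for 16 < m <= 32, while m <= 16 falls back to the
// 1-2-2 path.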